This dataset has been downloaded from the UC Irvine Machine Learning Repository.
https://archive.ics.uci.edu/ml/datasets/Credit+Approval
The dataset concerns credit card applications. The target variable/label is whether an application was granted credit or not. All attribute names and values have been changed to meaningless symbols to protect the confidentiality of the data.
The objective here is to build a model that produces a binary output from the input attributes.
Summary of Key information
Number of Instances/training examples : 690
Number of Instances with missing attributes : 37
Number of qualified Instances/training examples : 653
Number of Input Attributes : 15
Number of categorical attributes : 9
Number of numerical attributes : 6
Target Attribute Type : Binary Class
Target Class distribution : ~55%:45%
Problem Identification : Binary Classification with a balanced dataset
import os
print(os.environ['PATH'])
from notebook.services.config import ConfigManager
cm = ConfigManager()
cm.update('notebook', {'limit_output': 20})
import numpy as np
import pandas as pd
import time
import seaborn as sns
import matplotlib.pyplot as plt
from eda import eda_overview, axes_utils
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, learning_curve, KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import recall_score, precision_score, accuracy_score,confusion_matrix, ConfusionMatrixDisplay, classification_report, f1_score
pd.set_option('display.max_rows', 20)
pd.set_option('display.precision', 4)
path = "/Users/bhaskarroy/BHASKAR FILES/BHASKAR CAREER/Data Science/Practise/" \
"Python/UCI Machine Learning Repository/Credit Screening/"
# Index
# credit.lisp
# credit.names
# crx.data
# crx.names
path1 = path + "crx.data"
path_name = path + "credit.names"
path_crxname = path + "crx.names"
datContent = [i.strip().split() for i in open(path1).readlines()]
len(datContent)
print(dir(type(datContent[0][0])))
# Inspecting the contents
print(datContent[0][0].split(sep = ","))
len(datContent[0])
# Opening the file credit.names for the description of data set
with open(path_name) as f:
    print(f.read())
# Opening the file crx.names for the description of data set
with open(path_crxname) as f:
    print(f.read())
with open(path+"Index") as f:
print(f.read())
#with open(path+"credit.lisp") as f:
# print(f.read())
The following actions were undertaken:
# Inspecting the data
# We find that all the elements in a row are fused into one element.
# We need to split on commas to get the individual values.
datContent[0:5]
# Splitting using comma to get individual elements
print(datContent[0][0].split(sep = ","))
# The number of columns is 16 (15 input attributes plus the target)
attrCount = len(datContent[0][0].split(sep = ","))
attrCount
# As all features names have been changed/anonymised,
# we will create standard feature name starting with 'A' and suffixed with feature number
colNames = ["A"+str(i+1) for i in range(attrCount)]
print(colNames)
# Extracting values/data that will be passed as data to create the Dataframe
rawData = []
for i in datContent:
    for j in i:
        rawData.append(j.split(sep = ","))
# Creating the Dataframe
df = pd.DataFrame(rawData, columns = colNames)
# Inspecting the Dataframe
df.head()
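As an aside, pandas could build the same frame in one step; a minimal sketch (the na_values argument would additionally flag the '?' entries as missing up front):
# One-step load (sketch): read the comma-separated file directly,
# supplying the generated column names and marking '?' as missing
df_alt = pd.read_csv(path1, header=None, names=colNames, na_values='?')
df_alt.head()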
# Inspecting the dataframe
# We find that features 'A2','A16' have symbols that would require further preprocessing
df.describe()
# Checking the datatypes to decide the datatype conversions required feature wise
df.info()
#df['A2'].astype("float")
df1 = df[(df == "?").any(axis = 1)]
df1
# Selecting a subset without any missing values
df2 = df[(df != "?").all(axis = 1)]
df2.shape
df2.head()
# The code below may raise a SettingWithCopyWarning
# Use df._is_view to check whether a dataframe is a view or a copy
# df2.loc[:, 'A16'] = df2['A16'].map({"-": 0, "+":1}).values
# Use df.assign instead.
# https://stackoverflow.com/questions/36846060/how-to-replace-an-entire-column-on-pandas-dataframe
df2 = df2.assign(A16 = df2['A16'].map({"-": 0, "+":1}))
df2
from eda import datasets
datasets.credit_screening()
# Continuous variables are A2, A3, A8, A11, A14, A15
contAttr = ['A2', 'A3','A8', 'A11', 'A14', 'A15']
for i in contAttr:
    df2.loc[:,i] = df2[i].astype("float")
df2
Findings from the distribution of the numeric variables, both at the overall level and by application status, are as below:
eda_overview.UVA_numeric(data = df2, var_group = contAttr)
# Apply the default theme
sns.set_theme()
t = eda_overview.UVA_numeric_classwise(df2, 'A16', ['A16'],
colcount = 3, colwidth = 3,
rowheight = 3,
plot_type = 'histogram', element = 'step')
plt.gcf().savefig(path+'Numeric_interaction_class.png', dpi = 150)
t = eda_overview.distribution_comparison(df2, 'A16',['A16'])[0]
t
t.to_csv(path +'NumericDistributionComparison.csv')
# Inspecting number of unique values
df2[contAttr].nunique()
Findings from the correlation plot are as below:
# Continuous variables are A2, A3, A8, A11, A14, A15
contAttr = ['A2', 'A3','A8', 'A11', 'A14', 'A15']
# Target Variable is A16
targetAttr = ['A16']
df2[contAttr+targetAttr]
# Bivariate analysis at overall level
plt.rcdefaults()
#sns.set('notebook')
#sns.set_theme(style = 'whitegrid')
sns.set_context(font_scale = 0.6)
from pandas.plotting import scatter_matrix
scatter_matrix(df2[contAttr+targetAttr], figsize = (12,8));
# Bivariate analysis taking into account the target categories
#sns.set('notebook')
sns.set_theme(style="darkgrid")
sns.pairplot(df2[contAttr+targetAttr],hue= 'A16',height = 1.5)
df2[contAttr+targetAttr].dtypes
# Correlation table
df2[contAttr].corr()
# Heatmap for correlation of numeric attributes
fig, ax = plt.subplots(figsize=(5,4))
sns.heatmap(df2[contAttr].corr(), annot = True, ax = ax, annot_kws={"fontsize":10});
# Correlation matrix for customers not granted credit
fig, ax = plt.subplots(figsize=(5,4))
sns.heatmap(df2[df2['A16'] == 0][contAttr].corr(), ax = ax, annot_kws={"fontsize":10}, annot = True);
# Correlation matrix for customers granted credit
fig, ax = plt.subplots(figsize=(5,4))
sns.heatmap(df2[df2['A16'] == 1][contAttr].corr(),ax = ax,
annot_kws={"fontsize":10}, annot = True);
# Continuous variables are A2, A3, A8, A11, A14, A15
# Categorical Input Variables are A1, A4, A5, A6, A7, A9, A10, A12, A13
# Target Variable is A16 and is categorical.
catAttr = ["A1","A4", "A5", "A6", "A7", "A9", "A10", "A12", "A13"]
eda_overview.UVA_category(df2, var_group = catAttr + targetAttr,
colwidth = 3,
rowheight = 2,
colcount = 2,
spine_linewidth = 0.2,
nspaces = 4, ncountspaces = 3,
axlabel_fntsize = 7,
ax_xticklabel_fntsize = 7,
ax_yticklabel_fntsize = 7,
change_ratio = 0.6,
infofntsize = 7)
The dataset is balanced, as the ratio of the binary classes is ~55:45.
We can use Accuracy as an evaluation metric for the classifier model.
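A quick check of the class shares backs this up (a one-line sketch on the cleaned frame):
# Class shares of the target; roughly 55%/45% supports treating the data as balanced
df2['A16'].value_counts(normalize = True)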
plt.figure(figsize = (4,3), dpi = 100)
ax = sns.countplot(x = 'A16', data = df2, )
ax.set_ylim(0, 1.1*ax.get_ylim()[1])
axes_utils.Add_data_labels(ax.patches)
axes_utils.Change_barWidth(ax.patches, 0.8)
axes_utils.Add_valuecountsinfo(ax, 'A16',df2)
X, y = df2.drop(targetAttr, axis = 1), df2['A16']
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify = y)
print("X_train shape : {}".format(X_train.shape))
print("X_test shape : {}".format(X_test.shape))
print("y_train shape : {}".format(y_train.shape))
print("y_test shape : {}".format(y_test.shape))
X_train.head()
# Creating numeric Pipeline for standard scaling of numeric features
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
num_pipeline = Pipeline([
('std_scaler', StandardScaler())])
from sklearn import set_config
set_config(display="diagram")
num_pipeline
df2_num_tr = num_pipeline.fit_transform(df2[contAttr])
pd.DataFrame(df2_num_tr)
# Transforming the X_train and X_test
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Segregating the numeric and categorical features
num_attribs = ['A2', 'A3','A8', 'A11', 'A14', 'A15']
cat_attribs = ["A1","A4", "A5", "A6", "A7", "A9", "A10", "A12", "A13"]
# Creating Column Transformer for selectively applying transformations:
# both standard scaling and one hot encoding
full_pipeline = ColumnTransformer([
("num", num_pipeline, num_attribs),
("cat", OneHotEncoder(handle_unknown='ignore'), cat_attribs)])
# Creating Column Transformer for selectively applying transformations:
# only one hot encoding and no standard scaling
categorical_pipeline = ColumnTransformer([
("num_selector", "passthrough", num_attribs),
("cat", OneHotEncoder(handle_unknown='ignore'), cat_attribs)])
# Displaying the full_pipeline
from sklearn import set_config
set_config(display="diagram")
full_pipeline
# Displaying the categorical_pipeline
from sklearn import set_config
set_config(display="diagram")
categorical_pipeline
# Learning the parameters for transforming from train set using full_pipeline
# Transforming both train and test set
X_train_tr1 = full_pipeline.fit_transform(X_train)
X_test_tr1 = full_pipeline.transform(X_test)
# Learning the parameters for transforming from train set using categorical pipeline
# Transforming both train and test set
X_train_tr2 = categorical_pipeline.fit_transform(X_train)
X_test_tr2 = categorical_pipeline.transform(X_test)
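If needed, the transformed column names can be recovered from the fitted transformer; a sketch assuming scikit-learn >= 1.0, where ColumnTransformer exposes get_feature_names_out():
# Recovering the output feature names (scaled numeric columns + one-hot columns)
feature_names = full_pipeline.get_feature_names_out()
print(len(feature_names))
print(feature_names[:10])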
# Transforming the target variable
from sklearn.preprocessing import LabelEncoder
# prepare input data
def prepare_targets(y_train, y_test):
    le = LabelEncoder()
    le.fit(np.ravel(y_train))
    y_train_enc = le.transform(np.ravel(y_train))
    y_test_enc = le.transform(np.ravel(y_test))
    return y_train_enc, y_test_enc
y_train_tr, y_test_tr = prepare_targets(y_train, y_test)
# Function returning a string containing the classification report
# and the accuracy, precision, recall and F1 measures on train and test data.
# The default average parameter for the measures is 'weighted';
# use 'macro' when the minority class is of particular importance.
from sklearn.metrics import recall_score, precision_score, accuracy_score, \
confusion_matrix, ConfusionMatrixDisplay, classification_report, f1_score, \
roc_curve, auc
def evaluation_parametrics(y_train, yp_train, y_test, yp_test, average_param = 'weighted'):
    '''
    average_param : values can be 'weighted', 'micro', 'macro'.
    Check link:
    https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html#sklearn.metrics.precision_score
    '''
    d = 2
    txt = "-"*60 \
        + "\nClassification Report for Train Data\n" \
        + classification_report(y_train, yp_train) \
        + "\nClassification Report for Test Data\n" \
        + classification_report(y_test, yp_test) \
        + "\n" + "-"*60 + "\n" \
        + "Accuracy on Train Data is: {}".format(round(accuracy_score(y_train, yp_train), d)) \
        + "\n" \
        + "Accuracy on Test Data is: {}".format(round(accuracy_score(y_test, yp_test), d)) \
        + "\n" + "-"*60 + "\n" \
        + "Precision on Train Data is: {}".format(round(precision_score(y_train, yp_train, average = average_param), d)) \
        + "\n" \
        + "Precision on Test Data is: {}".format(round(precision_score(y_test, yp_test, average = average_param), d)) \
        + "\n" + "-"*60 + "\n" \
        + "Recall on Train Data is: {}".format(round(recall_score(y_train, yp_train, average = average_param), d)) \
        + "\n" \
        + "Recall on Test Data is: {}".format(round(recall_score(y_test, yp_test, average = average_param), d)) \
        + "\n" + "-"*60 + "\n" \
        + "F1 Score on Train Data is: {}".format(round(f1_score(y_train, yp_train, average = average_param), d)) \
        + "\n" \
        + "F1 Score on Test Data is: {}".format(round(f1_score(y_test, yp_test, average = average_param), d)) \
        + "\n" + "-"*60 + "\n"
    return txt
def Confusion_matrix_ROC_AUC(name, alias, pipeline,
                             X_train_tr, y_train_tr,
                             X_test_tr, y_test_tr):
    '''
    This function returns three plots :
    - Confusion matrix on testset predictions
    - Classification report for performance on train and test set
    - ROC curve and AUC for test set predictions

    The arguments are :
    name : short/terse name for the composite estimator
    alias : descriptive name for the composite estimator
    pipeline : composite estimator
    X_train_tr, y_train_tr : train set feature matrix, train set target
    X_test_tr, y_test_tr : test set feature matrix, test set target

    For reference, below is a list containing the tuple of (name, alias, pipeline)
    [('SGD', 'Stochastic Gradient Classifier', SGDClassifier(random_state=42)),
     ('LR', 'Logistic Regression Classifier', LogisticRegression(max_iter = 1000, random_state = 48)),
     ('RF', 'Random Forest Classifier', RandomForestClassifier(max_depth=2, random_state=42)),
     ('KNN', 'KNN Classifier', KNeighborsClassifier(n_neighbors = 7)),
     ('NB', 'Naive Bayes Classifier', GaussianNB()),
     ('SVC', 'Support Vector Classifier',
      LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-4, C=0.1)),
     ('CART', 'CART', DecisionTreeClassifier(max_depth = 7, random_state = 48)),
     ('GBM', 'Gradient Boosting Classifier',
      GradientBoostingClassifier(n_estimators=50, max_depth=10)),
     ('LDA', 'LDA Classifier', LinearDiscriminantAnalysis())]

    For instance, if the classifier is an SGDClassifier, the suggested name and alias are :
    'SGD'/'sgdclf', 'Stochastic Gradient Classifier'.
    It is recommended to adhere to the name and alias conventions,
    - as the name argument is used for checking whether CalibratedClassifierCV is required or not.
    - as the alias argument will be used as the title in the classification report plot.

    Calls the function : evaluation_parametrics
    Check the links :
    https://peps.python.org/pep-0008/#maximum-line-length
    https://scikit-learn.org/stable/glossary.html#term-predict_proba

    Example :
    >>> Confusion_matrix_ROC_AUC('sgd_clf', 'Stochastic Gradient Classifier', sgd_clf,
                                 X_train_tr, y_train_tr, X_test_tr, y_test_tr)
    '''
    from sklearn.metrics import recall_score, precision_score, accuracy_score, \
        confusion_matrix, ConfusionMatrixDisplay, classification_report, f1_score, \
        roc_curve, auc
    from sklearn.calibration import CalibratedClassifierCV

    fig = plt.figure(figsize=(10,5), dpi = 130)
    gridsize = (2, 3)
    ax1 = plt.subplot2grid(gridsize, (0, 0), colspan=1, rowspan=1)
    ax2 = plt.subplot2grid(gridsize, (0, 1), colspan = 2, rowspan = 2)
    ax3 = plt.subplot2grid(gridsize, (1, 0), colspan = 1)
    sns.set(font_scale=0.75)  # Adjust to fit

    #---------------------------------------------------------------------------------
    # Displaying the confusion matrix
    # Fitting the model
    model = pipeline
    model.fit(X_train_tr, y_train_tr)
    # Predictions on train and test set
    yp_train_tr = model.predict(X_train_tr)
    yp_test_tr = model.predict(X_test_tr)
    # Creating the confusion matrix for test set results
    cm = confusion_matrix(y_test_tr, yp_test_tr, labels = pipeline.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels = pipeline.classes_)
    ax1.grid(False)
    disp.plot(ax = ax1)
    ax1.set_title('Confusion Matrix on testset pred')

    #---------------------------------------------------------------------------------
    # Displaying the evaluation results that include the classification report
    eval_results = (str(alias)
                    + '\n'
                    + evaluation_parametrics(y_train_tr, yp_train_tr,
                                             y_test_tr, yp_test_tr))
    ax2.annotate(xy = (0,1), text = eval_results, size = 8,
                 ha = 'left', va = 'top', font = 'Andale Mono')
    ax2.patch.set(visible = False)
    ax2.tick_params(top=False, bottom=False, left=False, right=False,
                    labelleft=False, labelbottom=False)

    #---------------------------------------------------------------------------------
    # Displaying the ROC AUC curve
    import re
    pattern = re.compile('(sgd|SGD|SVC)')
    if re.search(pattern, name):
        print('Calibrated Classifier CV needed.')
        model = CalibratedClassifierCV(pipeline)
    else:
        print('Calibrated Classifier CV not needed.')
        model = pipeline
    # Fitting the model
    model.fit(X_train_tr, y_train_tr)
    # https://scikit-learn.org/stable/glossary.html#term-predict_proba
    preds = model.predict_proba(X_test_tr)
    pred = pd.Series(preds[:,1])
    fpr, tpr, thresholds = roc_curve(y_test_tr, pred)
    auc_score = auc(fpr, tpr)
    label = '%s: auc=%f' % (name, auc_score)
    ax3.plot(fpr, tpr, linewidth=1)
    ax3.fill_between(fpr, tpr, label = label, linewidth=1, alpha = 0.1, ec = 'black')
    ax3.plot([0, 1], [0, 1], 'k--')  # x=y line
    ax3.set_xlim([0.0, 1.0])
    ax3.set_ylim([0.0, 1.05])
    ax3.set_xlabel('False Positive Rate')
    ax3.set_ylabel('True Positive Rate')
    ax3.set_title('ROC curve')
    ax3.legend(loc = 'lower right')
    fig.tight_layout()
    plt.show()
    return fig
# Creating a dictionary to store the performance measures on train and test data
# Note the precision, recall and F1 score measures are weighted averages taking into account the class sizes
def create_dict(model, modelname, y_train, yp_train, y_test, yp_test, average_param = 'weighted'):
    '''
    average_param : values can be 'weighted', 'micro', 'macro'.
    Check link:
    https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html#sklearn.metrics.precision_score
    '''
    d = 4
    dict1 = {modelname: {"Accuracy": {"Train": float(np.round(accuracy_score(y_train, yp_train), d)),
                                      "Test": float(np.round(accuracy_score(y_test, yp_test), d))},
                         "F1": {"Train": float(np.round(f1_score(y_train, yp_train, average = average_param), d)),
                                "Test": float(np.round(f1_score(y_test, yp_test, average = average_param), d))},
                         "Recall": {"Train": float(np.round(recall_score(y_train, yp_train, average = average_param), d)),
                                    "Test": float(np.round(recall_score(y_test, yp_test, average = average_param), d))},
                         "Precision": {"Train": float(np.round(precision_score(y_train, yp_train, average = average_param), d)),
                                       "Test": float(np.round(precision_score(y_test, yp_test, average = average_param), d))}}
             }
    return dict1
dict_perf = {}
# Display the performance measure outputs for all the classifiers
# unpacking the dictionary to dataframe
def display_results(dict_perf):
    pd.set_option('display.precision', 4)
    user_ids = []
    frames = []
    for user_id, d in dict_perf.items():
        user_ids.append(user_id)
        frames.append(pd.DataFrame.from_dict(d, orient='columns'))
    df = pd.concat(frames, keys=user_ids)
    df = df.unstack(level = -1)
    return df
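For illustration, a minimal call with a hypothetical two-model dictionary (made-up numbers) shows the wide table this produces:
# Hypothetical example: two models with one measure each,
# unstacked into a (model x metric/split) table
toy = {'ModelA': {'Accuracy': {'Train': 0.90, 'Test': 0.85}},
       'ModelB': {'Accuracy': {'Train': 0.88, 'Test': 0.86}}}
display_results(toy)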
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
name = 'sgdclf'
sgd_clf = SGDClassifier(random_state=42)
st_time = time.time()
sgd_clf.fit(X_train_tr1, y_train_tr)
yp_train_tr = sgd_clf.predict(X_train_tr1)
yp_test_tr = sgd_clf.predict(X_test_tr1)
en_time = time.time()
print('Total time: {:.2f}s'.format(en_time-st_time))
#print(evaluation_parametrics(y_train_tr,yp_train_tr,y_test_tr,yp_test_tr,average_param = 'weighted'))
dict1 = create_dict(sgd_clf, "SGD Classifier",
y_train_tr, yp_train_tr, y_test_tr, yp_test_tr)
dict_perf.update(dict1)
sns.set(font_scale=0.75)
fig = Confusion_matrix_ROC_AUC('sgd_clf','Stochastic Gradient Classifier',
sgd_clf, X_train_tr1, y_train_tr,X_test_tr1,y_test_tr)
fig.savefig(path+'StochasticGradientClassifier.png', dpi = 150)
lr = LogisticRegression(max_iter = 1000,random_state = 48)
st_time = time.time()
lr.fit(X_train_tr1, y_train_tr)
yp_train_tr = lr.predict(X_train_tr1)
yp_test_tr = lr.predict(X_test_tr1)
en_time = time.time()
print('Total time: {:.2f}s'.format(en_time-st_time))
#print(evaluation_parametrics(y_train_tr,yp_train_tr,y_test_tr,yp_test_tr))
dict1 = create_dict(lr, "Logistic Regression Classifier", y_train_tr, yp_train_tr, y_test_tr, yp_test_tr)
dict_perf.update(dict1)
fig = Confusion_matrix_ROC_AUC('lr','Logistic Regression Classifier',lr, X_train_tr1, y_train_tr,X_test_tr1,y_test_tr)
rf_clf = RandomForestClassifier(max_depth=5, random_state=42)
st_time = time.time()
rf_clf.fit(X_train_tr2, y_train_tr)
yp_train_tr = rf_clf.predict(X_train_tr2)
yp_test_tr = rf_clf.predict(X_test_tr2)
en_time = time.time()
print('Total time: {:.2f}s'.format(en_time-st_time))
#print(evaluation_parametrics(y_train_tr,yp_train_tr,y_test_tr,yp_test_tr))
dict1 = create_dict(rf_clf, "Random Forest Classifier",
y_train_tr, yp_train_tr, y_test_tr, yp_test_tr)
dict_perf.update(dict1)
fig = Confusion_matrix_ROC_AUC('rf_clf','Random Forest Classifier',rf_clf,
X_train_tr2, y_train_tr,X_test_tr2,y_test_tr)
# training a KNN classifier
knn_clf = KNeighborsClassifier(n_neighbors = 7)
st_time = time.time()
knn_clf.fit(X_train_tr1, y_train_tr)
yp_train_tr = knn_clf.predict(X_train_tr1)
yp_test_tr = knn_clf.predict(X_test_tr1)
en_time = time.time()
print('Total time: {:.2f}s'.format(en_time-st_time))
#print(evaluation_parametrics(y_train_tr,yp_train_tr,y_test_tr,yp_test_tr))
dict1 = create_dict(knn_clf, "k-Nearest Neighbor Classifier", y_train_tr, yp_train_tr, y_test_tr, yp_test_tr)
dict_perf.update(dict1)
fig = Confusion_matrix_ROC_AUC('knn_clf', "k-Nearest Neighbor Classifier",knn_clf,
X_train_tr1, y_train_tr,X_test_tr1,y_test_tr)
# training a Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
gnb_clf = GaussianNB()
st_time = time.time()
gnb_clf.fit(X_train_tr2, y_train_tr)
yp_train_tr = gnb_clf.predict(X_train_tr2)
yp_test_tr = gnb_clf.predict(X_test_tr2)
en_time = time.time()
print('Total time: {:.2f}s'.format(en_time-st_time))
#print(evaluation_parametrics(y_train_tr,yp_train_tr,y_test_tr,yp_test_tr))
dict1 = create_dict(gnb_clf, "Gaussian Naive Bayes Classifier", y_train_tr, yp_train_tr, y_test_tr, yp_test_tr)
dict_perf.update(dict1)
fig = Confusion_matrix_ROC_AUC('gnb_clf', "Gaussian Naive Bayes Classifier",gnb_clf,
X_train_tr2, y_train_tr,X_test_tr2,y_test_tr)
svm = LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-4, C=0.1)
st_time = time.time()
svm.fit(X_train_tr1,y_train_tr)
yp_train_tr = svm.predict(X_train_tr1)
yp_test_tr = svm.predict(X_test_tr1)
en_time = time.time()
print('Total time: {:.2f}s'.format(en_time-st_time))
#print(evaluation_parametrics(y_train_tr,yp_train_tr,y_test_tr,yp_test_tr))
dict1 = create_dict(svm, "Support Vector Classifier", y_train_tr, yp_train_tr, y_test_tr, yp_test_tr)
dict_perf.update(dict1)
fig = Confusion_matrix_ROC_AUC('LinearSVC', "Support Vector Classifier",svm,
X_train_tr1, y_train_tr,X_test_tr1,y_test_tr)
dt = DecisionTreeClassifier(max_depth = 5,random_state = 48)
# Keeping max_depth = 5 to avoid overfitting
st_time = time.time()
dt.fit(X_train_tr2,y_train_tr)
yp_train_tr = dt.predict(X_train_tr2)
yp_test_tr = dt.predict(X_test_tr2)
en_time = time.time()
print('Total time: {:.2f}s'.format(en_time-st_time))
#print(evaluation_parametrics(y_train_tr,yp_train_tr,y_test_tr,yp_test_tr))
dict1 = create_dict(dt, "Decision Tree Classifier", y_train_tr, yp_train_tr, y_test_tr, yp_test_tr)
dict_perf.update(dict1)
fig = Confusion_matrix_ROC_AUC('dt', "Decision Tree Classifier", dt,
X_train_tr2, y_train_tr,X_test_tr2,y_test_tr)
gb_model = GradientBoostingClassifier(n_estimators=50, max_depth=5)
st_time = time.time()
gb_model.fit(X_train_tr2,y_train_tr)
yp_train_tr = gb_model.predict(X_train_tr2)
yp_test_tr = gb_model.predict(X_test_tr2)
en_time = time.time()
print('Total time: {:.2f}s'.format(en_time-st_time))
#print(evaluation_parametrics(y_train_tr,yp_train_tr,y_test_tr,yp_test_tr))
dict1 = create_dict(gb_model, "Gradient Boosting Classifier", y_train_tr, yp_train_tr, y_test_tr, yp_test_tr)
dict_perf.update(dict1)
fig = Confusion_matrix_ROC_AUC('gb_model', "Gradient Boosting Classifier", gb_model,
X_train_tr2, y_train_tr,X_test_tr2,y_test_tr)
lda_model = LinearDiscriminantAnalysis()
st_time = time.time()
lda_model.fit(X_train_tr1,y_train_tr)
yp_train_tr = lda_model.predict(X_train_tr1)
yp_test_tr = lda_model.predict(X_test_tr1)
en_time = time.time()
#print('Total time: {:.2f}s'.format(en_time-st_time))
print(evaluation_parametrics(y_train_tr,yp_train_tr,y_test_tr,yp_test_tr))
dict1 = create_dict(lda_model, "Linear Discriminant Analysis Classifier",
y_train_tr, yp_train_tr, y_test_tr, yp_test_tr)
dict_perf.update(dict1)
fig = Confusion_matrix_ROC_AUC('lda_model', "Linear Discriminant Analysis Classifier", lda_model,
X_train_tr1, y_train_tr,X_test_tr1,y_test_tr)
from matplotlib import pyplot as plt
import sklearn
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
# name -> (line format, classifier)
CLASS_MAP = {
'LogisticRegression':('-', LogisticRegression()),
'Naive Bayes': ('--', GaussianNB()),
'Decision Tree':('.-', DecisionTreeClassifier(max_depth=5)),
'Random Forest':(':', RandomForestClassifier( max_depth=5, n_estimators=10, max_features=1)),
}
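Every model in CLASS_MAP implements predict_proba; if a margin-based classifier such as LinearSVC or SGDClassifier were added here, it would need the same CalibratedClassifierCV wrapping used earlier. A sketch, left commented out:
from sklearn.calibration import CalibratedClassifierCV
# CLASS_MAP['Linear SVC'] = ('-.', CalibratedClassifierCV(LinearSVC(max_iter=10000)))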
for name, (line_fmt, model) in CLASS_MAP.items():
    model.fit(X_train_tr1, y_train_tr)
    # array with one column per label
    preds = model.predict_proba(X_test_tr1)
    pred = pd.Series(preds[:,1])
    fpr, tpr, thresholds = roc_curve(y_test_tr, pred)
    auc_score = auc(fpr, tpr)
    label = '%s: auc=%f' % (name, auc_score)
    plt.plot(fpr, tpr, line_fmt, linewidth=1, label=label)
plt.legend(loc="lower right")
plt.title('Comparing Classifiers')
display_results(dict_perf).style.background_gradient(cmap='Blues')
# Test options and evaluation metric
num_folds = 10
seed = 7
scoring = 'accuracy'
# Creating ColumnTransformer for preprocessing the training data folds and testing data fold within the k-fold
# Note that training data folds will be fitted and transformed
# The test data folds will be transformed
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
num_attribs = ['A2', 'A3','A8', 'A11', 'A14', 'A15']
cat_attribs = ["A1","A4", "A5", "A6", "A7", "A9", "A10", "A12", "A13"]
preprocessor1 = ColumnTransformer([
("num", num_pipeline, num_attribs),
("cat", OneHotEncoder(handle_unknown='ignore'), cat_attribs)])
preprocessor2 = ColumnTransformer([("cat", OneHotEncoder(handle_unknown='ignore'), cat_attribs)],
remainder = 'passthrough')
# Creating list of classifier models
models = [('SGD',SGDClassifier(random_state=42)),
('LR',LogisticRegression(max_iter = 1000,random_state = 48)),
('RF',RandomForestClassifier(max_depth=2, random_state=42)),
('KNN',KNeighborsClassifier(n_neighbors = 7)),
('NB',GaussianNB()),
('SVC',LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-4, C=0.1)),
('CART',DecisionTreeClassifier(max_depth = 7,random_state = 48)),
('GBM',GradientBoostingClassifier(n_estimators=50, max_depth=10)),
('LDA',LinearDiscriminantAnalysis())
]
# Creating the pipeline of model and preprocessor for feeding into the k-fold cross-validation
pipelines_list = []
for i in models:
    if i[0] not in ['RF','CART', 'GBM']:
        pipelines_list.append(('Scaled'+str(i[0]), Pipeline([('Preprocessor1', preprocessor1), i])))
    else:
        pipelines_list.append((str(i[0]), Pipeline([('Preprocessor2', preprocessor2), i])))
# Checking the pipeline
pipelines_list
# Evaluating the Algorithms
results = []
names = []
st_time = time.time()
for name, pipeline in pipelines_list:
    kfold = KFold(n_splits=num_folds, random_state=seed, shuffle = True)
    cv_results = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "{:<10}: {:<6} ({:^6})".format(name, cv_results.mean().round(4), cv_results.std().round(4))
    print(msg)
en_time = time.time()
print('Total time: {:.4f}s'.format(en_time-st_time))
tmp = pd.DataFrame(results).transpose()
tmp.columns = names
tmp
tmp.mean()
print('The top 4 algorithms based on cross-validation performance are :')
for alg, value in tmp.mean().sort_values(ascending = False)[0:4].items():
    print('{: <20} : {: 1.4f}'.format(alg, value))
# Mean and standard deviation of the cross-validation scores for the various classifier models
tmp1 = pd.concat([tmp.mean(), tmp.std()], axis = 1, keys = ['mean','std_dev'])
tmp1.style.background_gradient(cmap = 'Blues')
tmp1['mean'].idxmax()
np.argsort(tmp1['mean'])
# Understanding of np.argsort or df['col'].argsort()
n = 2
algorithm_index = tmp1['mean'].index.to_list()
top_n_idx = tmp1['mean'].argsort()[::-1][:n].values
top2_algorithms = [algorithm_index[i] for i in top_n_idx]
top2_algorithms
top_n_idx
n = 2
tmp1['mean'].argsort()[::-1][n]
n = 2
avgDists = np.array([1, 8, 6, 9, 4])
ids = avgDists.argsort()
ids
type(tmp1['mean'].argsort()[0])
# Compare Algorithms
plt.rcdefaults()
fig = plt.figure(figsize = (6,3))
ax = fig.add_subplot(111)
sns.boxplot(data = tmp, color = 'lightgrey', linewidth = 1, width = 0.5, orient = 'h')
# Coloring box-plots of top 2 mean values
n = 2
algorithm_index = tmp1['mean'].index.to_list()
top_2_idx = tmp1['mean'].argsort()[::-1][:n].values
for i in top_2_idx :
    # Select which box you want to change
    mybox = ax.patches[i]
    # Change the appearance of that box
    mybox.set_facecolor('salmon')
    mybox.set_alpha(0.8)
    # mybox.set_edgecolor('black')
    # mybox.set_linewidth(3)
# Coloring box-plots of 3rd and 4th mean values
top_3_4_idx = tmp1['mean'].argsort()[::-1][2:4].values
for i in top_3_4_idx :
    # Select which box you want to change
    mybox = ax.patches[i]
    # Change the appearance of that box
    mybox.set_facecolor('mediumturquoise')
    mybox.set_alpha(0.7)
    # mybox.set_edgecolor('black')
    # mybox.set_linewidth(3)
ax.grid(True, alpha = 0.4, ls = '--')
ax.set_axisbelow(True)
[labels.set(rotation = 20, ha = 'right') for labels in ax.get_xticklabels()]
[labels.set(size = 8) for labels in ax.get_yticklabels()]
for _, spine in ax.spines.items():
    spine.set_linewidth(.5)
ax.set_title('Algorithm Comparison using 10-fold CV scores', ha = 'center' )
ax.set_xlabel('CV score')
#ax.set_ylim(0.6,1)
plt.show()
fig.savefig(path+'spotchecking algorithms using 10 fold CV.png', dpi = 175)
display_results(dict_perf).style.background_gradient(cmap='Blues')
From both the simple train/test split and the cross-validation methods, we have shortlisted the two classifiers below with the highest accuracy measures:
-- Logistic Regression Classifier
-- Random Forest Classifier
# https://stackoverflow.com/questions/62331674/sklearn-combine-gridsearchcv-with-column-transform-and-pipeline?noredirect=1&lq=1
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
import pandas as pd
# define dataset
X, y = X_train, y_train
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
#numerical_features=make_column_selector(dtype_include=np.number)
#cat_features=make_column_selector(dtype_exclude=np.number)
num_attribs = ['A2', 'A3','A8', 'A11', 'A14', 'A15']
cat_attribs = ["A1","A4", "A5", "A6", "A7", "A9", "A10", "A12", "A13"]
# Setting the pipeline for preprocessing numeric and categorical variables
preprocessor = ColumnTransformer([
("num", num_pipeline, num_attribs),
("cat", OneHotEncoder(handle_unknown='ignore'), cat_attribs)])
# Creating a composite estimator by appending classifier estimator to preprocessor pipeline
model_lr = make_pipeline(preprocessor,
LogisticRegression(random_state = 48, max_iter = 1000) )
# define models and parameters
#model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid_lr = dict(logisticregression__solver=solvers,
logisticregression__penalty=penalty,
logisticregression__C=c_values)
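The 'logisticregression__' prefix must match the step name that make_pipeline auto-generates; as a quick check, the valid keys can be listed from the composite estimator:
# Listing the tunable parameter names exposed by the composite estimator
print([k for k in model_lr.get_params().keys() if k.startswith('logisticregression__')])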
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# Instantiating GridSearchCV object with composite estimator, parameter grid, CV(crossvalidation generator)
grid_search_lr = GridSearchCV(estimator=model_lr, param_grid=grid_lr,
n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result_lr = grid_search_lr.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result_lr.best_score_, grid_result_lr.best_params_))
means = grid_result_lr.cv_results_['mean_test_score']
stds = grid_result_lr.cv_results_['std_test_score']
params = grid_result_lr.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
set_config(display='text')
grid_result_lr.best_estimator_
set_config(display='diagram')
grid_search_lr.best_estimator_
# https://stackoverflow.com/questions/62331674/sklearn-combine-gridsearchcv-with-column-transform-and-pipeline?noredirect=1&lq=1
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
import pandas as pd
# define dataset
X, y = X_train, y_train
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
#numerical_features=make_column_selector(dtype_include=np.number)
#cat_features=make_column_selector(dtype_exclude=np.number)
num_attribs = ['A2', 'A3','A8', 'A11', 'A14', 'A15']
cat_attribs = ["A1","A4", "A5", "A6", "A7", "A9", "A10", "A12", "A13"]
preprocessor = ColumnTransformer([
("num", num_pipeline, num_attribs),
("cat", OneHotEncoder(handle_unknown='ignore'), cat_attribs)])
model_rf = make_pipeline(preprocessor, RandomForestClassifier())
# define model parameters
n_estimators = [10, 100, 1000]
max_features = ['sqrt', 'log2']
# define grid search
grid_rf = dict(randomforestclassifier__n_estimators=n_estimators,
randomforestclassifier__max_features=max_features
)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search_rf = GridSearchCV(estimator=model_rf, param_grid=grid_rf, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result_rf = grid_search_rf.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result_rf.best_score_, grid_result_rf.best_params_))
means = grid_result_rf.cv_results_['mean_test_score']
stds = grid_result_rf.cv_results_['std_test_score']
params = grid_result_rf.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
Since refit = True in grid_search_lr (the GridSearchCV for Logistic Regression), the best estimator, grid_result_lr.best_estimator_, is the final model, refit on the entire training set.
We will use this trained model to make predictions and assess performance on both the training and test sets.
# Inspecting the hyperparameters of the tuned estimator
set_config(display='text')
grid_result_lr.best_estimator_
final_estimator = grid_result_lr.best_estimator_.named_steps['logisticregression']
print(final_estimator)
print('\n')
print('Coefficients of Logistic Regression Model : \n {}'.format(final_estimator.coef_))
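To make the coefficients readable, they can be paired with the transformed feature names; a sketch assuming scikit-learn >= 1.0 (for get_feature_names_out) and pandas >= 1.1 (for the key argument of sort_values):
# Pairing each coefficient with its scaled-numeric or one-hot feature name,
# then ranking by absolute magnitude
feat_names = grid_result_lr.best_estimator_['columntransformer'].get_feature_names_out()
coef_series = pd.Series(final_estimator.coef_.ravel(), index = feat_names)
coef_series.sort_values(key = abs, ascending = False).head(10)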
final_model = grid_result_lr.best_estimator_
X_train_prepared = final_model['columntransformer'].transform(X_train)
train_predictions = final_model.named_steps['logisticregression'].predict(X_train_prepared)
# Testing the final model on the test set
X_test_prepared = final_model['columntransformer'].transform(X_test)
final_predictions = final_model['logisticregression'].predict(X_test_prepared)
print(evaluation_parametrics(y_train, train_predictions, y_test, final_predictions))
cm = confusion_matrix(y_test,final_predictions, labels= final_model['logisticregression'].classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
display_labels= final_model['logisticregression'].classes_)
sns.set(font_scale=1.5) # Adjust to fit
disp.plot()
plt.gca().grid(False)
plt.show()
# Inspecting the pipeline objects
final_model.steps
final_model['columntransformer'].named_transformers_['num'].get_params()
#https://stackoverflow.com/questions/67374844/how-to-find-out-standardscaling-parameters-mean-and-scale-when-using-column
final_model['columntransformer'].named_transformers_['num'].named_steps['std_scaler'].__getstate__()
import joblib
joblib.dump(final_model, 'credit-screening-lr.pkl')
clf = joblib.load('credit-screening-lr.pkl')
clf
print(list(dir(clf.named_steps['logisticregression'])))
clf.named_steps['logisticregression'].coef_
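As a final sanity check, the reloaded pipeline can score raw, untransformed rows directly, since the preprocessing steps travel inside it (a short sketch):
# The saved pipeline embeds the ColumnTransformer, so raw rows can be passed as-is
print(clf.predict(X_test.head()))
print('Test accuracy of reloaded model: {:.4f}'.format(clf.score(X_test, y_test)))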