Commit e70443ac authored by thecml

enabled lime to explain embeddings

parent 80b08d82
Pipeline #79270 passed
%% Cell type:code id: tags:
```
import tensorflow as tf
import numpy as np
import pandas as pd
from pathlib import Path
import paths as pt
import yaml
from tools import data_loader, preprocessor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def make_model(input_dim):
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(80,
                                    input_dim=input_dim,
                                    activation='relu'))
    model.add(tf.keras.layers.Dropout(0.35))
    model.add(tf.keras.layers.Dense(20, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.15))
    model.add(tf.keras.layers.Dense(10, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.15))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    metrics = [
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tf.keras.metrics.AUC(name='auc'),
    ]
    model.compile(loss='binary_crossentropy',
                  optimizer="Adam",
                  metrics=metrics)
    return model

# Load settings
with open(Path.joinpath(pt.CONFIGS_DIR, "complete_emb.yaml"), 'r') as stream:
    settings = yaml.safe_load(stream)

# Load the data
file_name = "complete_emb.csv"
dl = data_loader.CompleteDataLoader(file_name, settings).load_data()
features = dl.get_features()
X, y = dl.get_data()

# Calculate the class weight for the imbalanced target
neg, pos = np.bincount(y)
class_weight = preprocessor.get_class_weight(neg, pos)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7,
                                                    stratify=y, random_state=0)

# Scale the numeric features and reassemble the column order used by the model
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
numeric_feats = ['BirthYear', 'Cluster', 'LoanPeriod', 'NumberAts']
scaler = StandardScaler()
X_train_enc = pd.DataFrame(scaler.fit_transform(X_train[numeric_feats]),
                           columns=numeric_feats)
X_test_enc = pd.DataFrame(scaler.transform(X_test[numeric_feats]),
                          columns=numeric_feats)
num_X_train = X_train.drop(numeric_feats, axis=1)
num_X_test = X_test.drop(numeric_feats, axis=1)
X_train_sc = pd.concat([num_X_train[['Gender_Male', 'Gender_Female']], X_train_enc,
                        num_X_train[['Rand', '1Ats', '2Ats', '3Ats', '4Ats', '5Ats',
                                     '6Ats', '7Ats', '8Ats', '9Ats', '10Ats']]], axis=1)
X_test_sc = pd.concat([num_X_test[['Gender_Male', 'Gender_Female']], X_test_enc,
                       num_X_test[['Rand', '1Ats', '2Ats', '3Ats', '4Ats', '5Ats',
                                    '6Ats', '7Ats', '8Ats', '9Ats', '10Ats']]], axis=1)
```
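%% Cell type:markdown id: tags:

`tools.preprocessor.get_class_weight` lives outside this notebook, so its exact implementation is not shown in this commit. As a rough, hypothetical stand-in, it presumably applies the standard inverse-frequency weighting for an imbalanced binary target and returns a dict consumable by `model.fit(class_weight=...)`:

%% Cell type:code id: tags:

```
# Hypothetical sketch of the class-weight helper (the real code is in tools/preprocessor.py
# and may differ): weight each class inversely to its frequency.
def get_class_weight_sketch(neg: int, pos: int) -> dict:
    total = neg + pos
    return {0: (1 / neg) * (total / 2.0),
            1: (1 / pos) * (total / 2.0)}

get_class_weight_sketch(neg, pos)
```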
%% Cell type:code id: tags:
```
X_train = np.array(X_train)
X_train_sc = np.array(X_train_sc)
X_test = np.array(X_test)
X_test_sc = np.array(X_test_sc)
y_train = np.array(y_train)
y_test = np.array(y_test)

# Train the model
model = make_model(input_dim=X_train_sc.shape[1])
history = model.fit(X_train_sc, y_train, epochs=10,
                    class_weight=class_weight,
                    batch_size=32, verbose=True)
```
%% Output
WARNING:tensorflow:From C:\Users\cml\miniconda3\envs\py38-air\lib\site-packages\tensorflow\python\ops\array_ops.py:5043: calling gather (from tensorflow.python.ops.array_ops) with validate_indices is deprecated and will be removed in a future version.
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 1/10
47/47 [==============================] - 1s 7ms/step - loss: 0.6353 - accuracy: 0.6820 - precision: 0.7183 - recall: 0.9185 - auc: 0.5047
Epoch 2/10
47/47 [==============================] - 0s 7ms/step - loss: 0.6051 - accuracy: 0.7153 - precision: 0.7196 - recall: 0.9907 - auc: 0.5196
Epoch 3/10
47/47 [==============================] - 0s 6ms/step - loss: 0.5968 - accuracy: 0.7187 - precision: 0.7196 - recall: 0.9981 - auc: 0.5452
Epoch 4/10
47/47 [==============================] - 0s 6ms/step - loss: 0.5980 - accuracy: 0.7207 - precision: 0.7208 - recall: 0.9991 - auc: 0.5367
Epoch 5/10
47/47 [==============================] - 0s 6ms/step - loss: 0.5860 - accuracy: 0.7180 - precision: 0.7194 - recall: 0.9972 - auc: 0.5813
Epoch 6/10
47/47 [==============================] - 0s 6ms/step - loss: 0.5718 - accuracy: 0.7200 - precision: 0.7203 - recall: 0.9991 - auc: 0.6321
Epoch 7/10
47/47 [==============================] - 0s 7ms/step - loss: 0.5773 - accuracy: 0.7193 - precision: 0.7198 - recall: 0.9991 - auc: 0.6152
Epoch 8/10
47/47 [==============================] - 0s 7ms/step - loss: 0.5717 - accuracy: 0.7213 - precision: 0.7213 - recall: 0.9991 - auc: 0.6401
Epoch 9/10
47/47 [==============================] - 0s 7ms/step - loss: 0.5524 - accuracy: 0.7193 - precision: 0.7216 - recall: 0.9935 - auc: 0.6758
Epoch 10/10
47/47 [==============================] - 0s 7ms/step - loss: 0.5559 - accuracy: 0.7287 - precision: 0.7278 - recall: 0.9954 - auc: 0.6710
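%% Cell type:markdown id: tags:

A minimal sketch (not part of the original notebook): score the trained network on the held-out, scaled test split before explaining individual predictions.

%% Cell type:code id: tags:

```
# Evaluate on the scaled test set; assumes model, X_test_sc and y_test from the cells above.
results = model.evaluate(X_test_sc, y_test, batch_size=32, verbose=False)
for name, value in zip(model.metrics_names, results):
    print(f"{name}: {value:.4f}")
```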
%% Cell type:code id: tags:
```
def predict(data):
    # Scale the numeric columns (indices 2:6) with the fitted scaler before querying
    # the model, then threshold the sigmoid output at 0.5 and return the resulting
    # 0/1 pair per sample in the (n_samples, n_classes) layout LIME expects.
    global model, scaler
    data[:, 2:6] = scaler.transform(data[:, 2:6])
    y_pred = model.predict(data).reshape(-1, 1)
    y_pred = (y_pred > 0.5)
    print(np.array(list(zip(1 - y_pred.reshape(data.shape[0]),
                            y_pred.reshape(data.shape[0])))))
    return np.hstack((1 - y_pred, y_pred))

import lime
import lime.lime_tabular

explainer = lime.lime_tabular.LimeTabularExplainer(X_train, mode='classification',
                                                   class_names=['No complete', 'Complete'],
                                                   feature_names=features)
exp = explainer.explain_instance(X_test[27], predict, num_features=X_train.shape[1])
```
%% Output
[[0 1]
[1 0]
[0 1]
...
[0 1]
[1 0]
[0 1]]
%% Cell type:code id: tags:
```
exp.as_list()
```
%% Output
[('10Ats > 0.03', 0.20651574242348936),
('3Ats <= -0.05', 0.192865743355257),
('1Ats > 0.07', 0.18353986581301904),
('2Ats > -0.03', -0.18296575285910985),
('7Ats <= -0.02', -0.16874453649744492),
('9Ats <= -0.01', -0.13263677767888812),
('8Ats <= 0.06', -0.1273679310001168),
('6Ats > 0.08', 0.11597565308592392),
('5Ats <= -0.04', 0.11306311651048453),
('Gender_Female <= 0.00', -0.08515210039953246),
('8.00 < NumberAts <= 14.00', 0.06138616356339817),
('LoanPeriod > 1627.50', -0.05305334898011001),
('0.00 < Gender_Male <= 1.00', 0.05105843326784843),
('29.00 < BirthYear <= 35.00', 0.04509384864824174),
('Rand <= 0.25', 0.03759473681614202),
('0.03 < 4Ats <= 0.08', 0.0045078832741941325),
('Cluster > 11.00', 2.0051051251007468e-05)]
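%% Cell type:markdown id: tags:

As an optional aside (not in the original commit), the same explanation can be rendered graphically with LIME's built-in matplotlib helper; this assumes the `exp` object from the cell above.

%% Cell type:code id: tags:

```
# Plot the per-feature LIME weights for the positive class ('Complete').
fig = exp.as_pyplot_figure(label=1)
fig.tight_layout()
plt.show()
```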
%% Cell type:code id: tags:
```
model.predict(np.array([X_test_sc[27],]))
```
%% Output
array([[0.59483784]], dtype=float32)
@@ -14,6 +14,7 @@ import tensorflow as tf
 CASE = "Complete"
 FILENAME = "complete.csv"
+ATS_RESOLUTION = 10
 class NetworkCategory:
     def __init__(self, alias: str, unique_values: int):
@@ -106,14 +107,15 @@ def build_embedding_network(cat_cols, num_cols):
         tf.keras.metrics.BinaryAccuracy(name='accuracy'),
         tf.keras.metrics.Precision(name='precision'),
         tf.keras.metrics.Recall(name='recall'),
-        tf.keras.metrics.AUC(name='auc'),
+        tf.keras.metrics.AUC(name='roc_auc'),
+        tf.keras.metrics.AUC(name='pr_auc', curve='PR')
     ]
     model = tf.keras.Model(inputs=model_inputs, outputs=output_model)
-    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)
+    model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=metrics)
     return model
 def main():
-    ats_cols = {str(i)+'Ats':str for i in range(1, pt.ATS_RESOLUTION+1)}
+    ats_cols = {str(i)+'Ats':str for i in range(1, ATS_RESOLUTION+1)}
     df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                               FILENAME,
                               converters=ats_cols)
@@ -198,5 +200,9 @@ def main():
     print("Final recall: %.5f" % recall_score(y_test, y_scores_new))
     print("Final rocauc: %.5f" % roc_auc_score(y_test, y_pred_final))
+    from sklearn.metrics import precision_recall_curve, auc
+    precision, recall, _ = precision_recall_curve(y_test, y_pred_final)
+    print("Final prauc: %.5f" % auc(recall, precision))
 if __name__ == '__main__':
     main()