Christian Fischer Pedersen / AIR

Commit f2b8c184, authored Aug 18, 2021 by Christian Marius Lillelund

split feature making and target making

Parent: eefc6126
Pipeline #67099 failed with stage in 3 minutes and 12 seconds
Changes: 10
ml/models/fall_test/xgboost/.gitkeep deleted (100644 → 0)

ml/models/fall_test/.gitkeep → ml/models/risk/embeddings/.gitkeep (file moved)

ml/models/fall_test/embeddings/.gitkeep → ml/models/risk/xgboost/.gitkeep (file moved)

ml/notebooks/MLP_emb_lime.ipynb
...
...
@@ -3,193 +3,177 @@
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\r\n",
"import pandas as pd\r\n",
"import paths as pt\r\n",
"from tools import file_reader, preprocessor, neural_embedder\r\n",
"from utility import metrics\r\n",
"from sklearn.metrics import accuracy_score, precision_score\r\n",
"from sklearn.metrics import recall_score, roc_auc_score\r\n",
"from sklearn.model_selection import StratifiedKFold\r\n",
"from sklearn.model_selection import train_test_split\r\n",
"from pandas.api.types import is_string_dtype, is_numeric_dtype\r\n",
"from sklearn.preprocessing import StandardScaler\r\n",
"import tensorflow as tf\r\n",
"\r\n",
"CASE = \"Complete\"\r\n",
"FILENAME = \"complete.csv\"\r\n",
"\r\n",
"class NetworkCategory:\r\n",
"    def __init__(self, alias: str, unique_values: int):\r\n",
"        self.alias = alias\r\n",
"        self.unique_values = unique_values\r\n",
"        self.embedding_size = self.get_embedding_size(unique_values)\r\n",
"    \r\n",
"    def get_embedding_size(self, unique_values: int) -> int:\r\n",
"        size = int(min(np.ceil(unique_values / 2), 50))\r\n",
"        if size < 2:\r\n",
"            return 2\r\n",
"        else:\r\n",
"            return size\r\n",
"\r\n",
"def transpose_to_list(X):\r\n",
"    features_list = []\r\n",
"    for index in range(X.shape[1]):\r\n",
"        features_list.append(X[..., [index]])\r\n",
"\r\n",
"    return features_list\r\n",
"\r\n",
"def ginic(actual, pred):\r\n",
"    n = len(actual)\r\n",
"    a_s = actual[np.argsort(pred)]\r\n",
"    a_c = a_s.cumsum()\r\n",
"    giniSum = a_c.sum() / a_c[-1] - (n + 1) / 2.0\r\n",
"    return giniSum / n\r\n",
"    \r\n",
"def gini_normalizedc(a, p):\r\n",
"    return ginic(a, p) / ginic(a, a)\r\n",
"\r\n",
"def get_categorial_cols(df, target_name):\r\n",
"    cat_list = []\r\n",
"    for category in df:\r\n",
"        if not category == target_name and is_string_dtype(df[category]):\r\n",
"            cat_list.append(NetworkCategory(category, df[category].nunique()))\r\n",
"    return cat_list\r\n",
"\r\n",
"def get_numerical_cols(df, target_name):\r\n",
"    num_list = []\r\n",
"    for category in df:\r\n",
"        if not category == target_name and is_numeric_dtype(df[category]):\r\n",
"            num_list.append(category)\r\n",
"    return num_list\r\n",
"\r\n",
"def build_embedding_network(cat_cols, num_cols): \r\n",
"    # Make numerical layers\r\n",
"    numerical_inputs = []\r\n",
"    numerical_outputs = []\r\n",
"    \r\n",
"    for category in num_cols:\r\n",
"        input_category = tf.keras.layers.Input(shape=(1,))\r\n",
"        output_category = tf.keras.layers.Dense(1, name=category)(input_category)\r\n",
"        \r\n",
"        numerical_inputs.append(input_category)\r\n",
"        numerical_outputs.append(output_category)\r\n",
"    \r\n",
"    # Make embedding layers\r\n",
"    embedding_inputs = []\r\n",
"    embedding_outputs = []\r\n",
"    \r\n",
"    for category in cat_cols:\r\n",
"        input_category = tf.keras.layers.Input(shape=(1,))\r\n",
"        output_category = tf.keras.layers.Embedding(input_dim=category.unique_values,\r\n",
"                                                    output_dim=category.embedding_size,\r\n",
"                                                    name=category.alias)(input_category)\r\n",
"        output_category = tf.keras.layers.Reshape(target_shape=(category.embedding_size,))(output_category)\r\n",
"\r\n",
"        embedding_inputs.append(input_category)\r\n",
"        embedding_outputs.append(output_category)\r\n",
"    \r\n",
"    # Concatenate layers\r\n",
"    model_inputs = numerical_inputs + embedding_inputs\r\n",
"    model_outputs = numerical_outputs + embedding_outputs\r\n",
"    \r\n",
"    # Make hidden layers\r\n",
"    output_model = tf.keras.layers.Concatenate()(model_outputs)\r\n",
"    layer_sizes = [80, 20, 10]\r\n",
"    dropout_rates = [.35, .15, .15]\r\n",
"    for layer_size, dropout_rate in zip(layer_sizes, dropout_rates):\r\n",
"        output_model = tf.keras.layers.Dense(layer_size)(output_model)\r\n",
"        output_model = tf.keras.layers.Activation(\"relu\")(output_model)\r\n",
"        output_model = tf.keras.layers.Dropout(dropout_rate)(output_model)\r\n",
"    \r\n",
"    # Make final layer\r\n",
"    output_model = tf.keras.layers.Dense(1)(output_model)\r\n",
"    output_model = tf.keras.layers.Activation('sigmoid')(output_model)\r\n",
"\r\n",
"    metrics = [\r\n",
"        tf.keras.metrics.BinaryAccuracy(name='accuracy'),\r\n",
"        tf.keras.metrics.Precision(name='precision'),\r\n",
"        tf.keras.metrics.Recall(name='recall'),\r\n",
"        tf.keras.metrics.AUC(name='auc'),\r\n",
"    ]\r\n",
"    model = tf.keras.Model(inputs=model_inputs, outputs=output_model)\r\n",
"    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)\r\n",
"import numpy as np\n",
"import pandas as pd\n",
"import paths as pt\n",
"from tools import file_reader, preprocessor, neural_embedder\n",
"from utility import metrics\n",
"from sklearn.metrics import accuracy_score, precision_score\n",
"from sklearn.metrics import recall_score, roc_auc_score\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.model_selection import train_test_split\n",
"from pandas.api.types import is_string_dtype, is_numeric_dtype\n",
"from sklearn.preprocessing import StandardScaler\n",
"import tensorflow as tf\n",
"\n",
"CASE = \"Complete\"\n",
"FILENAME = \"complete.csv\"\n",
"\n",
"class NetworkCategory:\n",
" def __init__(self, alias: str, unique_values: int):\n",
" self.alias = alias\n",
" self.unique_values = unique_values\n",
" self.embedding_size = self.get_embedding_size(unique_values)\n",
" \n",
" def get_embedding_size(self, unique_values: int) -> int:\n",
" size = int(min(np.ceil(unique_values / 2), 50))\n",
" if size < 2:\n",
" return 2\n",
" else:\n",
" return size\n",
"\n",
"def transpose_to_list(X):\n",
" features_list = []\n",
" for index in range(X.shape[1]):\n",
" features_list.append(X[..., [index]])\n",
"\n",
" return features_list\n",
"\n",
"def ginic(actual, pred):\n",
" n = len(actual)\n",
" a_s = actual[np.argsort(pred)]\n",
" a_c = a_s.cumsum()\n",
" giniSum = a_c.sum() / a_c[-1] - (n + 1) / 2.0\n",
" return giniSum / n\n",
" \n",
"def gini_normalizedc(a, p):\n",
" return ginic(a, p) / ginic(a, a)\n",
"\n",
"def get_categorial_cols(df, target_name):\n",
" cat_list = []\n",
" for category in df:\n",
" if not category == target_name and is_string_dtype(df[category]):\n",
" cat_list.append(NetworkCategory(category, df[category].nunique()))\n",
" return cat_list\n",
"\n",
"def get_numerical_cols(df, target_name):\n",
" num_list = []\n",
" for category in df:\n",
" if not category == target_name and is_numeric_dtype(df[category]):\n",
" num_list.append(category)\n",
" return num_list\n",
"\n",
"def build_embedding_network(cat_cols, num_cols): \n",
" # Make numerical layers\n",
" numerical_inputs = []\n",
" numerical_outputs = []\n",
" \n",
" for category in num_cols:\n",
" input_category = tf.keras.layers.Input(shape=(1,))\n",
" output_category = tf.keras.layers.Dense(1, name=category)(input_category)\n",
" \n",
" numerical_inputs.append(input_category)\n",
" numerical_outputs.append(output_category)\n",
" \n",
" # Make embedding layers\n",
" embedding_inputs = []\n",
" embedding_outputs = []\n",
" \n",
" for category in cat_cols:\n",
" input_category = tf.keras.layers.Input(shape=(1,))\n",
" output_category = tf.keras.layers.Embedding(input_dim=category.unique_values,\n",
" output_dim=category.embedding_size,\n",
" name=category.alias)(input_category)\n",
" output_category = tf.keras.layers.Reshape(target_shape=(category.embedding_size,))(output_category)\n",
"\n",
" embedding_inputs.append(input_category)\n",
" embedding_outputs.append(output_category)\n",
" \n",
" # Concatenate layers\n",
" model_inputs = numerical_inputs + embedding_inputs\n",
" model_outputs = numerical_outputs + embedding_outputs\n",
" \n",
" # Make hidden layers\n",
" output_model = tf.keras.layers.Concatenate()(model_outputs)\n",
" layer_sizes = [80, 20, 10]\n",
" dropout_rates = [.35, .15, .15]\n",
" for layer_size, dropout_rate in zip(layer_sizes, dropout_rates):\n",
" output_model = tf.keras.layers.Dense(layer_size)(output_model)\n",
" output_model = tf.keras.layers.Activation(\"relu\")(output_model)\n",
" output_model = tf.keras.layers.Dropout(dropout_rate)(output_model)\n",
" \n",
" # Make final layer\n",
" output_model = tf.keras.layers.Dense(1)(output_model)\n",
" output_model = tf.keras.layers.Activation('sigmoid')(output_model)\n",
"\n",
" metrics = [\n",
" tf.keras.metrics.BinaryAccuracy(name='accuracy'),\n",
" tf.keras.metrics.Precision(name='precision'),\n",
" tf.keras.metrics.Recall(name='recall'),\n",
" tf.keras.metrics.AUC(name='auc'),\n",
" ]\n",
" model = tf.keras.Model(inputs=model_inputs, outputs=output_model)\n",
" model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)\n",
" return model"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 2,
"source": [
"ats_cols = {str(i)+'Ats':str for i in range(1, 10+1)}\r\n",
"df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,\r\n",
" FILENAME,\r\n",
" converters=ats_cols)\r\n",
" \r\n",
"emb_cols = df.filter(regex='((\\d+)[Ats])\\w+', axis=1)\r\n",
"n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1\r\n",
"\r\n",
"# Collect embedded and numerical cols\r\n",
"cat_cols = get_categorial_cols(df, CASE)\r\n",
"num_cols = get_numerical_cols(df, CASE)\r\n",
"\r\n",
"# Prepare the data\r\n",
"X, y = preprocessor.get_X_y(df, CASE)\r\n",
"X, labels = preprocessor.encode_vector_label(X, n_numerical_cols)\r\n",
"y = np.array(y)\r\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=0)\r\n",
"\r\n",
"scaler = StandardScaler()\r\n",
"X_train_sc = scaler.fit_transform(X_train[:,:n_numerical_cols])\r\n",
"X_test_sc = scaler.transform(X_test[:,:n_numerical_cols])\r\n",
"X_train = np.concatenate([X_train_sc, X_train[:,n_numerical_cols:]], axis=1)\r\n",
"X_test = np.concatenate([X_test_sc, X_test[:,n_numerical_cols:]], axis=1)\r\n",
"\r\n",
"# Network training\r\n",
"model = build_embedding_network(cat_cols, num_cols)\r\n",
"model.fit(transpose_to_list(X), y, epochs=10, batch_size=32, verbose=False)"
],
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"<tensorflow.python.keras.callbacks.History at 0x1d90e26a3a0>"
]
},
"execution_count": 2,
"metadata": {},
"execution_count": 2,
"output_type": "execute_result"
}
],
"metadata": {}
"source": [
"ats_cols = {str(i)+'Ats':str for i in range(1, 10+1)}\n",
"df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,\n",
" FILENAME,\n",
" converters=ats_cols)\n",
" \n",
"emb_cols = df.filter(regex='((\\d+)[Ats])\\w+', axis=1)\n",
"n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1\n",
"\n",
"# Collect embedded and numerical cols\n",
"cat_cols = get_categorial_cols(df, CASE)\n",
"num_cols = get_numerical_cols(df, CASE)\n",
"\n",
"# Prepare the data\n",
"X, y = preprocessor.get_X_y(df, CASE)\n",
"X, labels = preprocessor.encode_vector_label(X, n_numerical_cols)\n",
"y = np.array(y)\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=0)\n",
"\n",
"scaler = StandardScaler()\n",
"X_train_sc = scaler.fit_transform(X_train[:,:n_numerical_cols])\n",
"X_test_sc = scaler.transform(X_test[:,:n_numerical_cols])\n",
"X_train = np.concatenate([X_train_sc, X_train[:,n_numerical_cols:]], axis=1)\n",
"X_test = np.concatenate([X_test_sc, X_test[:,n_numerical_cols:]], axis=1)\n",
"\n",
"# Network training\n",
"model = build_embedding_network(cat_cols, num_cols)\n",
"model.fit(transpose_to_list(X), y, epochs=10, batch_size=32, verbose=False)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"source": [
"def prob(data):\r\n",
" global model\r\n",
" y_pred = model.predict(transpose_to_list(data)).reshape(-1, 1)\r\n",
" y_pred = (y_pred>0.5)\r\n",
" print(np.array(list(zip(1-y_pred.reshape(data.shape[0]),y_pred.reshape(data.shape[0])))))\r\n",
" return np.hstack((1-y_pred,y_pred))\r\n",
"\r\n",
"import lime\r\n",
"import lime.lime_tabular\r\n",
"features = list(df.columns)\r\n",
"features.remove('Complete')\r\n",
"explainer = lime.lime_tabular.LimeTabularExplainer(X_train, mode='classification',\r\n",
" class_names=['No complete', 'Complete'],\r\n",
" feature_names=features)\r\n",
"exp = explainer.explain_instance(X_test[27], prob, num_features=X_train.shape[1])"
],
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"[[0 1]\n",
" [0 1]\n",
...
...
@@ -201,37 +185,50 @@
]
}
],
"metadata": {}
"source": [
"def prob(data):\n",
" global model\n",
" y_pred = model.predict(transpose_to_list(data)).reshape(-1, 1)\n",
" y_pred = (y_pred>0.5)\n",
" print(np.array(list(zip(1-y_pred.reshape(data.shape[0]),y_pred.reshape(data.shape[0])))))\n",
" return np.hstack((1-y_pred,y_pred))\n",
"\n",
"import lime\n",
"import lime.lime_tabular\n",
"features = list(df.columns)\n",
"features.remove('Complete')\n",
"explainer = lime.lime_tabular.LimeTabularExplainer(X_train, mode='classification',\n",
" class_names=['No complete', 'Complete'],\n",
" feature_names=features)\n",
"exp = explainer.explain_instance(X_test[27], prob, num_features=X_train.shape[1])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"source": [
"model.predict(transpose_to_list(np.array([X_test[27],])))"
],
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([[0.60240066]], dtype=float32)"
]
},
"execution_count": 4,
"metadata": {},
"execution_count": 4,
"output_type": "execute_result"
}
],
"metadata": {}
"source": [
"model.predict(transpose_to_list(np.array([X_test[27],])))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"source": [
"exp.as_list()"
],
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[('-0.77 < Gender_Male <= 1.30', 0.0),\n",
...
...
@@ -252,35 +249,38 @@
" ('10Ats <= 0.00', 0.0)]"
]
},
"execution_count": 5,
"metadata": {},
"execution_count": 5,
"output_type": "execute_result"
}
],
"metadata": {}
"source": [
"exp.as_list()"
]
}
],
"metadata": {
"orig_nbformat": 4,
"interpreter": {
"hash": "59ff6fbb0321898508cf6243593820bf2585fcfb6693fd00e85ec94ed8847fd0"
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.8.8",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3.8.8 64-bit ('py38-air': conda)"
},
"interpreter": {
"hash": "59ff6fbb0321898508cf6243593820bf2585fcfb6693fd00e85ec94ed8847fd0"
},
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
\ No newline at end of file
}
%% Cell type:code id: tags:
``` python
import numpy as np
import pandas as pd
import paths as pt
from tools import file_reader, preprocessor, neural_embedder
from utility import metrics
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from pandas.api.types import is_string_dtype, is_numeric_dtype
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

CASE = "Complete"
FILENAME = "complete.csv"

class NetworkCategory:
    def __init__(self, alias: str, unique_values: int):
        self.alias = alias
        self.unique_values = unique_values
        self.embedding_size = self.get_embedding_size(unique_values)

    def get_embedding_size(self, unique_values: int) -> int:
        size = int(min(np.ceil(unique_values / 2), 50))
        if size < 2:
            return 2
        else:
            return size

def transpose_to_list(X):
    features_list = []
    for index in range(X.shape[1]):
        features_list.append(X[..., [index]])

    return features_list

def ginic(actual, pred):
    n = len(actual)
    a_s = actual[np.argsort(pred)]
    a_c = a_s.cumsum()
    giniSum = a_c.sum() / a_c[-1] - (n + 1) / 2.0
    return giniSum / n

def gini_normalizedc(a, p):
    return ginic(a, p) / ginic(a, a)

def get_categorial_cols(df, target_name):
    cat_list = []
    for category in df:
        if not category == target_name and is_string_dtype(df[category]):
            cat_list.append(NetworkCategory(category, df[category].nunique()))
    return cat_list

def get_numerical_cols(df, target_name):
    num_list = []
    for category in df:
        if not category == target_name and is_numeric_dtype(df[category]):
            num_list.append(category)
    return num_list

def build_embedding_network(cat_cols, num_cols):
    # Make numerical layers
    numerical_inputs = []
    numerical_outputs = []

    for category in num_cols:
        input_category = tf.keras.layers.Input(shape=(1,))
        output_category = tf.keras.layers.Dense(1, name=category)(input_category)

        numerical_inputs.append(input_category)
        numerical_outputs.append(output_category)

    # Make embedding layers
    embedding_inputs = []
    embedding_outputs = []

    for category in cat_cols:
        input_category = tf.keras.layers.Input(shape=(1,))
        output_category = tf.keras.layers.Embedding(input_dim=category.unique_values,
                                                    output_dim=category.embedding_size,
                                                    name=category.alias)(input_category)
        output_category = tf.keras.layers.Reshape(target_shape=(category.embedding_size,))(output_category)

        embedding_inputs.append(input_category)
        embedding_outputs.append(output_category)

    # Concatenate layers
    model_inputs = numerical_inputs + embedding_inputs
    model_outputs = numerical_outputs + embedding_outputs

    # Make hidden layers
    output_model = tf.keras.layers.Concatenate()(model_outputs)
    layer_sizes = [80, 20, 10]
    dropout_rates = [.35, .15, .15]
    for layer_size, dropout_rate in zip(layer_sizes, dropout_rates):
        output_model = tf.keras.layers.Dense(layer_size)(output_model)
        output_model = tf.keras.layers.Activation("relu")(output_model)
        output_model = tf.keras.layers.Dropout(dropout_rate)(output_model)

    # Make final layer
    output_model = tf.keras.layers.Dense(1)(output_model)
    output_model = tf.keras.layers.Activation('sigmoid')(output_model)

    metrics = [
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tf.keras.metrics.AUC(name='auc'),
    ]
    model = tf.keras.Model(inputs=model_inputs, outputs=output_model)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)
    return model
```
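The two helpers above are easy to sanity-check in isolation. Below is a minimal sketch, not part of this commit; the aliases and counts are invented for illustration. It confirms the embedding-size rule min(ceil(n/2), 50) with a floor of 2, and that transpose_to_list yields one (n_samples, 1) column per feature, matching the model's list of Input layers.

``` python
import numpy as np

# Hypothetical values, only for illustration, not from the notebook's data.
assert NetworkCategory("3Ats", unique_values=7).embedding_size == 4    # ceil(7/2) = 4
assert NetworkCategory("toy", unique_values=2).embedding_size == 2     # floor of 2
assert NetworkCategory("toy", unique_values=200).embedding_size == 50  # capped at 50

# transpose_to_list splits an (n_samples, n_features) matrix into a list of
# (n_samples, 1) columns, one per Input layer of the embedding network.
X_toy = np.arange(6).reshape(2, 3)
parts = transpose_to_list(X_toy)
assert len(parts) == 3 and parts[0].shape == (2, 1)
```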
%% Cell type:code id: tags:
``` python
ats_cols = {str(i)+'Ats':str for i in range(1, 10+1)}
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                          FILENAME,
                          converters=ats_cols)

emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1

# Collect embedded and numerical cols
cat_cols = get_categorial_cols(df, CASE)
num_cols = get_numerical_cols(df, CASE)

# Prepare the data
X, y = preprocessor.get_X_y(df, CASE)
X, labels = preprocessor.encode_vector_label(X, n_numerical_cols)
y = np.array(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=0)

scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train[:,:n_numerical_cols])
X_test_sc = scaler.transform(X_test[:,:n_numerical_cols])
X_train = np.concatenate([X_train_sc, X_train[:,n_numerical_cols:]], axis=1)
X_test = np.concatenate([X_test_sc, X_test[:,n_numerical_cols:]], axis=1)

# Network training
model = build_embedding_network(cat_cols, num_cols)
model.fit(transpose_to_list(X), y, epochs=10, batch_size=32, verbose=False)
```
%% Output
<tensorflow.python.keras.callbacks.History at 0x1d90e26a3a0>
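Note that the fit above trains on the full X rather than the held-out split, so held-out scores would be optimistic here. Still, the sklearn metrics imported in the first cell give a quick check of the fitted model. This sketch is not part of the commit; it assumes the cells above have run:

``` python
# Threshold the predicted probabilities at 0.5 and score the held-out rows.
y_prob = model.predict(transpose_to_list(X_test)).reshape(-1)
y_hat = (y_prob > 0.5).astype(int)

print("accuracy: ", accuracy_score(y_test, y_hat))
print("precision:", precision_score(y_test, y_hat))
print("recall:   ", recall_score(y_test, y_hat))
print("roc_auc:  ", roc_auc_score(y_test, y_prob))
```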
%% Cell type:code id: tags:
``` python
def prob(data):
    global model
    y_pred = model.predict(transpose_to_list(data)).reshape(-1, 1)
    y_pred = (y_pred>0.5)
    print(np.array(list(zip(1-y_pred.reshape(data.shape[0]),y_pred.reshape(data.shape[0])))))
    return np.hstack((1-y_pred,y_pred))

import lime
import lime.lime_tabular
features = list(df.columns)
features.remove('Complete')
explainer = lime.lime_tabular.LimeTabularExplainer(X_train, mode='classification',
                                                   class_names=['No complete', 'Complete'],
                                                   feature_names=features)
exp = explainer.explain_instance(X_test[27], prob, num_features=X_train.shape[1])
```