Commit f2b8c184 authored by Christian Marius Lillelund's avatar Christian Marius Lillelund

split feature making and target making

parent eefc6126
Pipeline #67099 failed with stage in 3 minutes and 12 seconds
%% Cell type:code id: tags:
``` python
import numpy as np
import pandas as pd
import paths as pt
from tools import file_reader, preprocessor, neural_embedder
from utility import metrics
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from pandas.api.types import is_string_dtype, is_numeric_dtype
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
CASE = "Complete"
FILENAME = "complete.csv"
class NetworkCategory:
    """Holds a categorical column's name, cardinality and embedding size."""
    def __init__(self, alias: str, unique_values: int):
        self.alias = alias
        self.unique_values = unique_values
        self.embedding_size = self.get_embedding_size(unique_values)

    def get_embedding_size(self, unique_values: int) -> int:
        # Heuristic: half the cardinality, capped at 50, floored at 2
        size = int(min(np.ceil(unique_values / 2), 50))
        return max(size, 2)
def transpose_to_list(X):
    # Split a 2D array into a list of single-column arrays,
    # one per model input
    features_list = []
    for index in range(X.shape[1]):
        features_list.append(X[..., [index]])
    return features_list
def ginic(actual, pred):
    # Gini coefficient of `pred` as a ranking of `actual`
    n = len(actual)
    a_s = actual[np.argsort(pred)]
    a_c = a_s.cumsum()
    gini_sum = a_c.sum() / a_c[-1] - (n + 1) / 2.0
    return gini_sum / n

def gini_normalizedc(a, p):
    # Normalize by the Gini of a perfect ranking
    return ginic(a, p) / ginic(a, a)
def get_categorical_cols(df, target_name):
    # Wrap every non-target string column in a NetworkCategory
    cat_list = []
    for category in df:
        if category != target_name and is_string_dtype(df[category]):
            cat_list.append(NetworkCategory(category, df[category].nunique()))
    return cat_list

def get_numerical_cols(df, target_name):
    # Collect the names of every non-target numeric column
    num_list = []
    for category in df:
        if category != target_name and is_numeric_dtype(df[category]):
            num_list.append(category)
    return num_list
def build_embedding_network(cat_cols, num_cols):
    # Make numerical layers
    numerical_inputs = []
    numerical_outputs = []
    for category in num_cols:
        input_category = tf.keras.layers.Input(shape=(1,))
        output_category = tf.keras.layers.Dense(1, name=category)(input_category)
        numerical_inputs.append(input_category)
        numerical_outputs.append(output_category)

    # Make embedding layers
    embedding_inputs = []
    embedding_outputs = []
    for category in cat_cols:
        input_category = tf.keras.layers.Input(shape=(1,))
        output_category = tf.keras.layers.Embedding(input_dim=category.unique_values,
                                                    output_dim=category.embedding_size,
                                                    name=category.alias)(input_category)
        output_category = tf.keras.layers.Reshape(target_shape=(category.embedding_size,))(output_category)
        embedding_inputs.append(input_category)
        embedding_outputs.append(output_category)

    # Concatenate layers
    model_inputs = numerical_inputs + embedding_inputs
    model_outputs = numerical_outputs + embedding_outputs

    # Make hidden layers
    output_model = tf.keras.layers.Concatenate()(model_outputs)
    layer_sizes = [80, 20, 10]
    dropout_rates = [.35, .15, .15]
    for layer_size, dropout_rate in zip(layer_sizes, dropout_rates):
        output_model = tf.keras.layers.Dense(layer_size)(output_model)
        output_model = tf.keras.layers.Activation("relu")(output_model)
        output_model = tf.keras.layers.Dropout(dropout_rate)(output_model)

    # Make final layer
    output_model = tf.keras.layers.Dense(1)(output_model)
    output_model = tf.keras.layers.Activation('sigmoid')(output_model)

    # Local name avoids shadowing the imported `metrics` module
    model_metrics = [
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tf.keras.metrics.AUC(name='auc'),
    ]
    model = tf.keras.Model(inputs=model_inputs, outputs=output_model)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=model_metrics)
    return model
```
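%% Cell type:markdown id: tags:
A quick, hedged sanity check of the helpers above (not part of the original pipeline): the embedding-size heuristic should return half the cardinality, capped at 50 and floored at 2; `transpose_to_list` should yield one column array per feature; a perfect ranking should give a normalized Gini of 1; and a toy network with one numerical and one categorical input should compile.
%% Cell type:code id: tags:
``` python
# Sanity-check the embedding-size heuristic
for n in [2, 3, 10, 120]:
    print(n, "->", NetworkCategory("demo", n).embedding_size)  # 2, 2, 5, 50

# transpose_to_list splits a matrix into per-feature column arrays
X_demo = np.arange(12).reshape(4, 3)
cols = transpose_to_list(X_demo)
print(len(cols), cols[0].shape)  # 3 inputs, each of shape (4, 1)

# A perfect ranking of the targets has normalized Gini 1.0
a = np.array([0., 0., 1., 1.])
print(gini_normalizedc(a, a))  # 1.0

# Build a toy network: one categorical input (5 levels), one numerical input
toy = build_embedding_network([NetworkCategory("demo_cat", 5)], ["demo_num"])
toy.summary()
```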
%% Cell type:code id: tags:
``` python
# Read the 1Ats..10Ats columns as strings (categorical ATS codes)
ats_cols = {str(i)+'Ats': str for i in range(1, 10+1)}
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                          FILENAME,
                          converters=ats_cols)
emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
# Collect embedded and numerical cols
cat_cols = get_categorical_cols(df, CASE)
num_cols = get_numerical_cols(df, CASE)
# Prepare the data
X, y = preprocessor.get_X_y(df, CASE)
X, labels = preprocessor.encode_vector_label(X, n_numerical_cols)
y = np.array(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=0)

# Scale only the numerical columns; leave the encoded categoricals intact
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train[:,:n_numerical_cols])
X_test_sc = scaler.transform(X_test[:,:n_numerical_cols])
X_train = np.concatenate([X_train_sc, X_train[:,n_numerical_cols:]], axis=1)
X_test = np.concatenate([X_test_sc, X_test[:,n_numerical_cols:]], axis=1)

# Network training (fit on the training split, not the full data)
model = build_embedding_network(cat_cols, num_cols)
model.fit(transpose_to_list(X_train), y_train, epochs=10, batch_size=32, verbose=False)
```
%% Output
<tensorflow.python.keras.callbacks.History at 0x1d90e26a3a0>
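%% Cell type:markdown id: tags:
A hedged evaluation sketch, not part of the original commit: score the trained network on the held-out split with the sklearn metrics imported above, thresholding the predicted probabilities at 0.5.
%% Cell type:code id: tags:
``` python
# Assumes `model`, `X_test`, `y_test` and the helpers from the cells above
y_proba = model.predict(transpose_to_list(X_test)).reshape(-1)
y_hat = (y_proba > 0.5).astype(int)
print(f"Accuracy:  {accuracy_score(y_test, y_hat):.3f}")
print(f"Precision: {precision_score(y_test, y_hat):.3f}")
print(f"Recall:    {recall_score(y_test, y_hat):.3f}")
print(f"ROC AUC:   {roc_auc_score(y_test, y_proba):.3f}")
print(f"Gini norm: {gini_normalizedc(y_test.astype(float), y_proba):.3f}")
```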
%% Cell type:code id: tags:
``` python
def prob(data):
    # Prediction wrapper for LIME: thresholds the network's output at 0.5
    # and returns hard [P(class 0), P(class 1)] scores; the cast to int
    # is needed because NumPy forbids `1 - bool_array`
    global model
    y_pred = model.predict(transpose_to_list(data)).reshape(-1, 1)
    y_pred = (y_pred > 0.5).astype(int)
    return np.hstack((1 - y_pred, y_pred))
import lime
import lime.lime_tabular
features = list(df.columns)
features.remove(CASE)  # drop the target column
explainer = lime.lime_tabular.LimeTabularExplainer(X_train, mode='classification',
                                                   class_names=['No complete', 'Complete'],
                                                   feature_names=features)
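# Hedged usage sketch: explain one held-out row with the `prob` wrapper.
# `explain_instance` and `show_in_notebook` are standard LIME calls; the
# choice of row and `num_features` here is illustrative only.
exp = explainer.explain_instance(X_test[0], prob, num_features=5)
exp.show_in_notebook(show_table=True)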