Commit a8bf8bd5 authored by thecml's avatar thecml
Browse files

did some cleaning, added cv surv

parent 60e89abf
Pipeline #95103 passed with stage
in 4 minutes and 36 seconds
%% Cell type:code id: tags:
```
import pandas as pd
import config as cfg
from tools import file_reader
from pathlib import Path

# Build an updated ATS reference table: take each device's 6-character
# ISO-class prefix with its display name, then merge with the existing
# ats.csv reference so previously known entries take precedence.
df = file_reader.read_pickle(cfg.INTERIM_DATA_DIR, 'ats.pkl')
df['DevISOClass'] = df['DevISOClass'].apply(lambda x: x[:6])
df = df.drop_duplicates(subset=['DevISOClass'])
df = df[['DevHMIName', 'DevISOClass']]
df = df.sort_values('DevISOClass')
# Put the id column first, then rename to the reference schema.
df = df.reindex(columns=["DevISOClass", "DevHMIName"])
df = df.rename(columns={"DevISOClass": "ats_id", "DevHMIName": "ats_name"})
ats = file_reader.read_csv(cfg.REFERENCES_DIR, 'ats.csv',
                           converters={'ats_id': str})
# Outer merge keeps both known and newly discovered ids; keep='first'
# prefers the existing reference row when an id appears in both frames.
df = ats.merge(df, how='outer', on=['ats_id']).drop_duplicates(['ats_id'], keep='first')
file_name = "ats full.csv"  # plain literal: no placeholders, so no f-string
df.to_csv(Path.joinpath(cfg.REFERENCES_DIR, file_name), index=False)
```
%% Cell type:code id: tags:
```
import pandas as pd
import config as cfg
from tools import file_reader
from pathlib import Path

# Load the raw ATS lending records and translate each device's ISO-class
# code (6-character prefix) into a readable name using the ats.csv mapping.
df = file_reader.read_pickle(cfg.INTERIM_DATA_DIR, 'ats.pkl').reset_index(drop=True)
mapping = file_reader.read_csv(cfg.REFERENCES_DIR, 'ats.csv',
                               converters={'ats_id': str})
df['DevISOClass'] = df['DevISOClass'].apply(lambda code: code[:6])
# Rows without a citizen id cannot be linked to anyone; drop them.
df = df.dropna(subset=['CitizenId'])
# Two-column frame -> {ats_id: ats_name} lookup, applied over the frame.
id_to_name = dict(mapping.values)
df = df.replace(to_replace=id_to_name)
df
```
%% Output
CitizenId Gender BirthYear DevHMINumber \\n0 1002012383 MALE 0 800026 \n1 1002012383 MALE 0 800027 \n2 1002012383 MALE 0 800278 \n3 1002012383 MALE 0 800174 \n4 1002012383 MALE 0 800027 \n... ... ... ... ... \n311708 825965067 MALE 98 42273 \n311709 825965067 MALE 98 42273 \n311710 825965067 MALE 98 101101 \n311711 825965067 MALE 98 89463 \n311712 825965067 MALE 98 31353 \n\n DevHMIName \\n0 5501 Hjørnestol 1-3 år \n1 5502 Bord til hjørnestol 1-3 år \n2 Hynder/puder til hjørnestole \n3 Nakkestøtte m. pude til hjørnestol. \n4 5502 Bord til hjørnestol 1-3 år \n... ... \n311708 Albuestok med blødt standard håndtag, med clips \n311709 Albuestok med blødt standard håndtag, med clips \n311710 HAWK, SB 40 cm \n311711 Wing Viscoflex Plus, SB 40 x SD 40 cm, SH 8 cm \n311712 AD Stimulite Classic siddepude, 41x41x7 cm \n\n DevISOClass DevSerial LawParagraph LendDate \\n0 SpecielleSiddemøbler 800026-000017 97 2000-12-19 \n1 SpecielleSiddemøbler 800027-000003 0 2000-12-19 \n2 SpecielleSiddemøbler 800278-000011 0 2000-12-19 \n3 SpecielleSiddemøbler 800174-000005 0 2000-12-19 \n4 SpecielleSiddemøbler 800027-000005 0 2001-01-11 \n... ... ... ... ... \n311708 Albuestokke 042273-000612 112 2019-11-13 \n311709 Albuestokke 042273-000613 112 2019-11-13 \n311710 KørestoleManuelleDrivringe 101101-000003 112 2019-12-09 \n311711 TryksårsforebyggendeSidde 089463-000011 112 2019-12-09 \n311712 TryksårsforebyggendeSidde 031353-000002 112 2020-05-05 \n\n ReturnDate Price \n0 2001-11-26 0.0 \n1 2001-11-26 0.0 \n2 2001-11-26 0.0 \n3 2001-11-26 0.0 \n4 2001-11-26 0.0 \n... ... ... \n311708 NaT 0.0 \n311709 NaT 0.0 \n311710 NaT 0.0 \n311711 2020-08-17 0.0 \n311712 NaT 0.0 \n\n[311713 rows x 11 columns]
%% Cell type:code id: tags:
```
import numpy as np
import pandas as pd
import paths as pt
from tools import file_reader, preprocessor, neural_embedder, data_loader
from utility import metrics
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from pandas.api.types import is_string_dtype, is_numeric_dtype
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from pathlib import Path
import paths as pt
import yaml
# Number of ATS device-history columns in the processed dataset
# (columns named '1Ats' .. '10Ats'); used to build the CSV converters below.
ATS_RESOLUTION = 10
class NetworkCategory:
    """Describes one categorical feature for the embedding network.

    Holds the column alias, its cardinality, and a heuristic embedding
    width: half the cardinality (rounded up), clamped to [2, 50].
    """

    def __init__(self, alias: str, unique_values: int):
        self.alias = alias
        self.unique_values = unique_values
        self.embedding_size = self.get_embedding_size(unique_values)

    def get_embedding_size(self, unique_values: int) -> int:
        """Return the embedding width for a feature with this many levels."""
        half = int(np.ceil(unique_values / 2))
        return max(2, min(half, 50))
def transpose_to_list(X):
    """Split a 2-D array into one single-column array per feature.

    Keras multi-input models expect one array per input layer; each
    returned element has shape (n_samples, 1).
    """
    return [X[..., [col]] for col in range(X.shape[1])]
def ginic(actual, pred):
    """Unnormalized Gini coefficient of the ranking `pred` induces on `actual`."""
    n = len(actual)
    # Reorder the actual values by ascending prediction score.
    ordered = actual[np.argsort(pred)]
    cumulative = np.cumsum(ordered)
    gini_sum = cumulative.sum() / cumulative[-1] - (n + 1) / 2.0
    return gini_sum / n
def gini_normalizedc(a, p):
    """Gini of predictions `p`, normalized by the perfect-ordering Gini."""
    perfect = ginic(a, a)
    return ginic(a, p) / perfect
def get_categorial_cols(df, target_name):
    """Wrap every non-target string-typed column in a NetworkCategory."""
    return [NetworkCategory(col, df[col].nunique())
            for col in df
            if col != target_name and is_string_dtype(df[col])]
def get_numerical_cols(df, target_name):
    """Return the names of all non-target numeric columns, in frame order."""
    return [col for col in df
            if col != target_name and is_numeric_dtype(df[col])]
def build_embedding_network(cat_cols, num_cols):
    """Build and compile a binary classifier with entity embeddings.

    Each numerical column gets its own 1-unit input branch; each
    categorical column gets an Embedding layer sized by its
    NetworkCategory. All branches are concatenated and passed through
    three Dense/ReLU/Dropout layers to a single sigmoid output.

    :param cat_cols: list of NetworkCategory objects for the categorical inputs.
    :param num_cols: list of numerical column names.
    :return: compiled tf.keras.Model (binary cross-entropy, Adam optimizer).
    """
    # Make numerical layers: one scalar input + Dense(1) per numeric column.
    numerical_inputs = []
    numerical_outputs = []
    for category in num_cols:
        input_category = tf.keras.layers.Input(shape=(1,))
        output_category = tf.keras.layers.Dense(1, name=category)(input_category)
        numerical_inputs.append(input_category)
        numerical_outputs.append(output_category)
    # Make embedding layers: one Embedding per categorical column.
    embedding_inputs = []
    embedding_outputs = []
    for category in cat_cols:
        input_category = tf.keras.layers.Input(shape=(1,))
        output_category = tf.keras.layers.Embedding(
            input_dim=category.unique_values,
            output_dim=category.embedding_size,
            name=category.alias)(input_category)
        # Flatten (1, embedding_size) to (embedding_size,) for concatenation.
        output_category = tf.keras.layers.Reshape(
            target_shape=(category.embedding_size,))(output_category)
        embedding_inputs.append(input_category)
        embedding_outputs.append(output_category)
    # Concatenate the numerical and embedded branches.
    model_inputs = numerical_inputs + embedding_inputs
    model_outputs = numerical_outputs + embedding_outputs
    output_model = tf.keras.layers.Concatenate()(model_outputs)
    # Hidden stack: Dense -> ReLU -> Dropout, three times.
    layer_sizes = [80, 20, 10]
    dropout_rates = [.35, .15, .15]
    for layer_size, dropout_rate in zip(layer_sizes, dropout_rates):
        output_model = tf.keras.layers.Dense(layer_size)(output_model)
        output_model = tf.keras.layers.Activation("relu")(output_model)
        output_model = tf.keras.layers.Dropout(dropout_rate)(output_model)
    # Final sigmoid output for binary classification.
    output_model = tf.keras.layers.Dense(1)(output_model)
    output_model = tf.keras.layers.Activation('sigmoid')(output_model)
    # Renamed from `metrics` so the local list does not shadow the
    # `metrics` module imported from `utility` at the top of the file.
    eval_metrics = [
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tf.keras.metrics.AUC(name='roc_auc'),
        tf.keras.metrics.AUC(name='pr_auc', curve='PR')
    ]
    model = tf.keras.Model(inputs=model_inputs, outputs=output_model)
    model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=eval_metrics)
    return model
# Load the data, forcing the ATS columns ('1Ats' .. '10Ats') to be read
# as strings so their codes are not parsed as numbers.
file_name = 'complete.csv'
ats_cols = {str(i)+'Ats': str for i in range(1, ATS_RESOLUTION+1)}
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, file_name, converters=ats_cols)
# Count numerical cols as everything except the ATS columns and the target.
# Raw string fixes the invalid-escape warning for \d in a plain literal.
# NOTE(review): `[Ats]` is a character class matching one of 'A'/'t'/'s' —
# possibly meant as the literal sequence 'Ats'; kept byte-identical to
# preserve the current column selection.
emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
# Collect embedded and numerical cols
cat_cols = get_categorial_cols(df, 'Complete')
num_cols = get_numerical_cols(df, 'Complete')
# Prepare the data: split features/target and label-encode the categoricals.
X, y = preprocessor.get_X_y(df, 'Complete')
X, labels = preprocessor.encode_vector_label(X, n_numerical_cols)
y = np.array(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7,
                                                    stratify=y, random_state=0)
# Upsampling: duplicate every y == 0 training sample once.
pos = pd.Series(y_train == 0)
X_train = np.concatenate((X_train, X_train[pos]), axis=0)
y_train = np.concatenate((y_train, y_train[pos]), axis=0)
# Scaling: standardize only the numerical columns (first n_numerical_cols),
# leaving the label-encoded categorical columns untouched.
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train[:, :n_numerical_cols])
X_test_sc = scaler.transform(X_test[:, :n_numerical_cols])
X_train = np.concatenate([X_train_sc, X_train[:, n_numerical_cols:]], axis=1)
X_test = np.concatenate([X_test_sc, X_test[:, n_numerical_cols:]], axis=1)
# Network training
model = build_embedding_network(cat_cols, num_cols)
model.fit(transpose_to_list(X_train), y_train, epochs=10, batch_size=32, verbose=False)
```
%% Output
<tensorflow.python.keras.callbacks.History at 0x250c3079040>