Christian Fischer Pedersen / AIR / Commits

Commit a8bf8bd5, authored Nov 24, 2021 by thecml
did some cleaning, added cv surv
parent 60e89abf
Pipeline #95103 passed with stage in 4 minutes and 36 seconds
Changes: 19 · Pipelines: 1
ml/models/alarm_rsf.joblib
No preview for this file type
ml/notebooks/ATS_make_names.ipynb
deleted 100644 → 0
%% Cell type:code id: tags:
```
import pandas as pd
import config as cfg
from tools import file_reader
from pathlib import Path

# Build an ats_id -> ats_name lookup from the interim ATS data
df = file_reader.read_pickle(cfg.INTERIM_DATA_DIR, 'ats.pkl')
df['DevISOClass'] = df['DevISOClass'].apply(lambda x: x[:6])  # truncate ISO class to its first six characters
df = df.drop_duplicates(subset=['DevISOClass'])
df = df[['DevHMIName', 'DevISOClass']]
df = df.sort_values('DevISOClass')
columns_titles = ["DevISOClass", "DevHMIName"]
df = df.reindex(columns=columns_titles)
df = df.rename(columns={"DevISOClass": "ats_id", "DevHMIName": "ats_name"})

# Merge with the existing reference list, keeping the first entry per ats_id
ats = file_reader.read_csv(cfg.REFERENCES_DIR, 'ats.csv',
                           converters={'ats_id': str})
df = ats.merge(df, how='outer', on=['ats_id']).drop_duplicates(['ats_id'], keep='first')

file_name = "ats full.csv"
df.to_csv(Path.joinpath(cfg.REFERENCES_DIR, file_name), index=False)
```
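The deleted cell above refreshes the `ats.csv` reference list by outer-merging a fresh id-to-name extract into the existing file. A minimal sketch of how that merge behaves, with made-up ids and names (none of this data is from the repository):

```
# Toy sketch of the outer merge + drop_duplicates pattern used above.
import pandas as pd

ats = pd.DataFrame({'ats_id': ['120606', '122203'],
                    'ats_name': ['Rollator', 'Toiletsæde']})
new = pd.DataFrame({'ats_id': ['122203', '123103'],
                    'ats_name': ['Toiletsæde', 'Albuestok']})

# The outer merge keeps ids from both tables; because both frames carry an
# 'ats_name' column, pandas suffixes the clash as ats_name_x / ats_name_y,
# and drop_duplicates guards against an id occurring more than once.
out = ats.merge(new, how='outer', on=['ats_id']).drop_duplicates(['ats_id'], keep='first')
print(out)  # three ids; 122203 appears once, with both name columns filled
```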
ml/notebooks/ATS_mapping.ipynb
deleted 100644 → 0
%% Cell type:code id: tags:
```
import pandas as pd
import config as cfg
from tools import file_reader
from pathlib import Path

# Load interim ATS data and the ats_id -> ats_name mapping
df = file_reader.read_pickle(cfg.INTERIM_DATA_DIR, 'ats.pkl').reset_index(drop=True)
mapping = file_reader.read_csv(cfg.REFERENCES_DIR, 'ats.csv',
                               converters={'ats_id': str})

# Truncate ISO class to its first six characters and drop rows without a citizen id
df['DevISOClass'] = df['DevISOClass'].apply(lambda x: x[:6])
df = df.dropna(subset=['CitizenId'])

# Replace ats_id codes with their human-readable names
mapping_dict = dict(mapping.values)
df = df.replace(to_replace=mapping_dict)
df
```
%% Output
         CitizenId Gender  BirthYear  DevHMINumber \
0       1002012383   MALE          0        800026
1       1002012383   MALE          0        800027
2       1002012383   MALE          0        800278
3       1002012383   MALE          0        800174
4       1002012383   MALE          0        800027
...            ...    ...        ...           ...
311708   825965067   MALE         98         42273
311709   825965067   MALE         98         42273
311710   825965067   MALE         98        101101
311711   825965067   MALE         98         89463
311712   825965067   MALE         98         31353

                                              DevHMIName \
0                                 5501 Hjørnestol 1-3 år
1                        5502 Bord til hjørnestol 1-3 år
2                           Hynder/puder til hjørnestole
3                    Nakkestøtte m. pude til hjørnestol.
4                        5502 Bord til hjørnestol 1-3 år
...                                                  ...
311708  Albuestok med blødt standard håndtag, med clips
311709  Albuestok med blødt standard håndtag, med clips
311710                                    HAWK, SB 40 cm
311711    Wing Viscoflex Plus, SB 40 x SD 40 cm, SH 8 cm
311712        AD Stimulite Classic siddepude, 41x41x7 cm

                       DevISOClass      DevSerial  LawParagraph    LendDate \
0             SpecielleSiddemøbler  800026-000017            97  2000-12-19
1             SpecielleSiddemøbler  800027-000003             0  2000-12-19
2             SpecielleSiddemøbler  800278-000011             0  2000-12-19
3             SpecielleSiddemøbler  800174-000005             0  2000-12-19
4             SpecielleSiddemøbler  800027-000005             0  2001-01-11
...                            ...            ...           ...         ...
311708                 Albuestokke  042273-000612           112  2019-11-13
311709                 Albuestokke  042273-000613           112  2019-11-13
311710  KørestoleManuelleDrivringe  101101-000003           112  2019-12-09
311711   TryksårsforebyggendeSidde  089463-000011           112  2019-12-09
311712   TryksårsforebyggendeSidde  031353-000002           112  2020-05-05

       ReturnDate  Price
0      2001-11-26    0.0
1      2001-11-26    0.0
2      2001-11-26    0.0
3      2001-11-26    0.0
4      2001-11-26    0.0
...           ...    ...
311708        NaT    0.0
311709        NaT    0.0
311710        NaT    0.0
311711 2020-08-17    0.0
311712        NaT    0.0

[311713 rows x 11 columns]
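The mapping step relies on `dict(mapping.values)` turning a two-column DataFrame into a plain `{ats_id: ats_name}` dict, which `df.replace` then applies cell-wise. A small sketch with hypothetical codes and names:

```
# Toy sketch of the dict-based replacement above; all ids/names are made up.
import pandas as pd

mapping = pd.DataFrame({'ats_id': ['120606', '123103'],
                        'ats_name': ['Rollator', 'Albuestok']})
mapping_dict = dict(mapping.values)  # {'120606': 'Rollator', '123103': 'Albuestok'}

df = pd.DataFrame({'DevISOClass': ['120606', '123103', '999999']})
print(df.replace(to_replace=mapping_dict))
# Codes without a mapping entry (here '999999') pass through unchanged.
```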
ml/notebooks/MLP_emb_lime.ipynb
deleted 100644 → 0
%% Cell type:code id: tags:
```
import numpy as np
import pandas as pd
import paths as pt
from tools import file_reader, preprocessor, neural_embedder, data_loader
from utility import metrics
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from pandas.api.types import is_string_dtype, is_numeric_dtype
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from pathlib import Path
import yaml

ATS_RESOLUTION = 10

class NetworkCategory:
    def __init__(self, alias: str, unique_values: int):
        self.alias = alias
        self.unique_values = unique_values
        self.embedding_size = self.get_embedding_size(unique_values)

    def get_embedding_size(self, unique_values: int) -> int:
        size = int(min(np.ceil(unique_values / 2), 50))
        if size < 2:
            return 2
        else:
            return size

def transpose_to_list(X):
    features_list = []
    for index in range(X.shape[1]):
        features_list.append(X[..., [index]])
    return features_list

def ginic(actual, pred):
    n = len(actual)
    a_s = actual[np.argsort(pred)]
    a_c = a_s.cumsum()
    giniSum = a_c.sum() / a_c[-1] - (n + 1) / 2.0
    return giniSum / n

def gini_normalizedc(a, p):
    return ginic(a, p) / ginic(a, a)

def get_categorial_cols(df, target_name):
    cat_list = []
    for category in df:
        if not category == target_name and is_string_dtype(df[category]):
            cat_list.append(NetworkCategory(category, df[category].nunique()))
    return cat_list

def get_numerical_cols(df, target_name):
    num_list = []
    for category in df:
        if not category == target_name and is_numeric_dtype(df[category]):
            num_list.append(category)
    return num_list

def build_embedding_network(cat_cols, num_cols):
    # Make numerical layers
    numerical_inputs = []
    numerical_outputs = []

    for category in num_cols:
        input_category = tf.keras.layers.Input(shape=(1,))
        output_category = tf.keras.layers.Dense(1, name=category)(input_category)

        numerical_inputs.append(input_category)
        numerical_outputs.append(output_category)

    # Make embedding layers
    embedding_inputs = []
    embedding_outputs = []

    for category in cat_cols:
        input_category = tf.keras.layers.Input(shape=(1,))
        output_category = tf.keras.layers.Embedding(input_dim=category.unique_values,
                                                    output_dim=category.embedding_size,
                                                    name=category.alias)(input_category)
        output_category = tf.keras.layers.Reshape(target_shape=(category.embedding_size,))(output_category)

        embedding_inputs.append(input_category)
        embedding_outputs.append(output_category)

    # Concatenate layers
    model_inputs = numerical_inputs + embedding_inputs
    model_outputs = numerical_outputs + embedding_outputs

    # Make hidden layers
    output_model = tf.keras.layers.Concatenate()(model_outputs)
    layer_sizes = [80, 20, 10]
    dropout_rates = [.35, .15, .15]
    for layer_size, dropout_rate in zip(layer_sizes, dropout_rates):
        output_model = tf.keras.layers.Dense(layer_size)(output_model)
        output_model = tf.keras.layers.Activation("relu")(output_model)
        output_model = tf.keras.layers.Dropout(dropout_rate)(output_model)

    # Make final layer
    output_model = tf.keras.layers.Dense(1)(output_model)
    output_model = tf.keras.layers.Activation('sigmoid')(output_model)

    metrics = [
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tf.keras.metrics.AUC(name='roc_auc'),
        tf.keras.metrics.AUC(name='pr_auc', curve='PR')
    ]
    model = tf.keras.Model(inputs=model_inputs, outputs=output_model)
    model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=metrics)
    return model

# Load the data
file_name = 'complete.csv'
ats_cols = {str(i)+'Ats':str for i in range(1, ATS_RESOLUTION+1)}
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, file_name, converters=ats_cols)

# Get number of numerical cols
emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1

# Collect embedded and numerical cols
cat_cols = get_categorial_cols(df, 'Complete')
num_cols = get_numerical_cols(df, 'Complete')

# Prepare the data
X, y = preprocessor.get_X_y(df, 'Complete')
X, labels = preprocessor.encode_vector_label(X, n_numerical_cols)
y = np.array(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7,
                                                    stratify=y, random_state=0)

# Upsampling
pos = (pd.Series(y_train == 0))
X_train = np.concatenate((X_train, X_train[pos]), axis=0)
y_train = np.concatenate((y_train, y_train[pos]), axis=0)

# Scaling
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train[:,:n_numerical_cols])
X_test_sc = scaler.transform(X_test[:,:n_numerical_cols])
X_train = np.concatenate([X_train_sc, X_train[:,n_numerical_cols:]], axis=1)
X_test = np.concatenate([X_test_sc, X_test[:,n_numerical_cols:]], axis=1)

# Network training
model = build_embedding_network(cat_cols, num_cols)
model.fit(transpose_to_list(X_train), y_train, epochs=10, batch_size=32, verbose=False)
```
%% Output
<tensorflow.python.keras.callbacks.History at 0x250c3079040>
%% Cell type:code id: tags:
```
results = model.evaluate(transpose_to_list(X_test), y_test)
print(f'Test loss: {results[0]} \nTest accuracy: {results[1]}' +
      f'\nTest precision: {results[2]} \nTest recall: {results[3]}' +
      f'\nTest roc_auc: {results[4]} \nTest pr_auc: {results[5]}')
```
%% Output
21/21 [==============================] - 1s 10ms/step - loss: 0.7547 - accuracy: 0.6211 - precision: 0.7664 - recall: 0.6803 - roc_auc: 0.5916 - pr_auc: 0.7656
Test loss: 0.7547047734260559
Test accuracy: 0.6211180090904236
Test precision: 0.7664233446121216
Test recall: 0.6803455948829651
Test roc_auc: 0.5915719270706177
Test pr_auc: 0.7655594944953918
%% Cell type:code id: tags:
```
columns_to_skip = ['Gender_Male', 'Gender_Female', 'BirthYear',
                   'Cluster', 'LoanPeriod', 'NumberAts'] + ['Complete']
embedded_weights = []
columns = df.loc[:, list(filter(lambda x: x not in columns_to_skip, df.columns))]
for col in columns:
    weights = model.get_layer(col).get_weights()[0]
    embedded_weights.append(weights)
```
%% Cell type:code id: tags:
```
from sklearn.decomposition import PCA

def strings_to_embeddings(x):
    data = x.copy()
    n_embedded_cols = emb_cols.shape[1]
    cols = range(n_numerical_cols, n_numerical_cols+n_embedded_cols)
    for i, col in enumerate(cols):
        embeddings_column = embedded_weights[i]
        pca = PCA(n_components=1)
        y = np.concatenate(pca.fit_transform(embeddings_column))
        encoder = labels[i]
        mapping = dict(zip(encoder.classes_, y))
        org_data = encoder.inverse_transform(data[:, col].astype(int))
        mapped_data = [mapping[data] for data in org_data]
        data[:, col] = mapped_data
    return data
```
%% Cell type:code id: tags:
```
def predict(data):
    global model, scaler
    data[:,:n_numerical_cols] = scaler.transform(data[:,:n_numerical_cols])
    data = strings_to_embeddings(data)
    y_pred = model.predict(transpose_to_list(data)).reshape(-1, 1)
    y_pred = (y_pred>0.5)
    return np.hstack((1-y_pred,y_pred))

import lime
import lime.lime_tabular
features = list(df.columns)
features.remove('Complete')
explainer = lime.lime_tabular.LimeTabularExplainer(X_train, mode='classification',
                                                   class_names=['No complete', 'Complete'],
                                                   feature_names=features)
exp = explainer.explain_instance(X_test[0], predict, num_features=X_train.shape[1])
exp.as_list()
```
%% Output
[('Gender_Male <= -0.73', -0.5051485443991541),
 ('-1.37 < Gender_Female <= 0.73', 0.24463139170246656),
 ('-0.29 < NumberAts <= 0.38', 0.07051200836688602),
 ('1.00 < 8Ats <= 20.25', 0.013587126766024115),
 ('11.00 < 4Ats <= 24.00', 0.009975327212022503),
 ('-0.73 < LoanPeriod <= -0.21', -0.007156671962724148),
 ('1Ats <= 10.00', 0.006032846505298264),
 ('2Ats > 24.00', -0.005847530037414186),
 ('3Ats <= 6.00', 0.005389994671909197),
 ('6.00 < 7Ats <= 23.00', 0.004772352035400859),
 ('-0.19 < BirthYear <= 0.59', -0.003858202127112373),
 ('0.00 < 5Ats <= 14.00', 0.0036927510709771558),
 ('10Ats <= 0.00', 0.0015974481898791604),
 ('-1.00 < Cluster <= -0.20', -0.0005204929245957747),
 ('9Ats > 17.00', 0.0004451113292966627),
 ('9.00 < 6Ats <= 26.00', -0.00013931021082525937)]
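Two details of the deleted notebook are easy to miss. `transpose_to_list` exists because a Keras model built from one `Input` per feature must be fed a list of per-feature arrays rather than a single matrix, and `get_embedding_size` clamps the `ceil(cardinality / 2)` heuristic to the range [2, 50]. A self-contained sketch of both, using toy data only:

```
import numpy as np

def get_embedding_size(unique_values: int) -> int:
    # ceil(n / 2), clamped to [2, 50], as in NetworkCategory above
    return max(2, int(min(np.ceil(unique_values / 2), 50)))

def transpose_to_list(X):
    # Split an (n_samples, n_features) matrix into one (n_samples, 1)
    # array per feature, matching one Keras Input layer per column.
    return [X[..., [i]] for i in range(X.shape[1])]

print([get_embedding_size(n) for n in (2, 5, 17, 200)])  # [2, 3, 9, 50]
parts = transpose_to_list(np.arange(6).reshape(3, 2))    # 3 samples, 2 features
print([p.shape for p in parts])                          # [(3, 1), (3, 1)]
```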
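For the LIME cell, note that `LimeTabularExplainer` in classification mode expects the prediction function to return an `(n_samples, n_classes)` array, so the notebook's `predict` expands the model's single sigmoid output into two columns via `np.hstack((1-y_pred, y_pred))`; because it thresholds at 0.5 first, LIME sees hard 0/1 scores rather than probabilities. A minimal sketch of that expansion, with a made-up score vector:

```
import numpy as np

def as_two_class(p):
    # Expand a positive-class score p into [score(class 0), score(class 1)],
    # the shape lime.lime_tabular expects from a classifier's predict_fn.
    p = np.asarray(p, dtype=float).reshape(-1, 1)
    return np.hstack((1 - p, p))

print(as_two_class([0.2, 0.9]))
# [[0.8 0.2]
#  [0.1 0.9]]
```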