Commit aa322116 authored by Christian Marius Lillelund

improved data scripts and named fourth case Risk

parent 21cb5577
@@ -9,7 +9,7 @@ model_path: models/compliance/embeddings
train_ratio: 0.8
batch_size: 32
num_epochs: 20
num_epochs: 10
verbose: True
network_layers: [128]
optimizer: "Adam"
\ No newline at end of file
---
# Dataset Stuff -------------------------------------------------
#
target_name: "Risk"
model_path: models/risk/embeddings
# Training Hyperparams --------------------------------------
train_ratio: 0.8
batch_size: 32
num_epochs_ats: 10
num_epochs_ex: 5
verbose: True
network_layers: [128]
optimizer: "Adam"
\ No newline at end of file
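Note: the new Risk config splits the single num_epochs key into num_epochs_ats and num_epochs_ex so the two embedders can train for different lengths. A minimal sketch of how a script reads it (the literal configs/ path is an assumption; the scripts below resolve it via pt.CONFIGS_DIR):

    from pathlib import Path
    import yaml

    # Path assumed for illustration; the repo goes through pt.CONFIGS_DIR.
    with open(Path("configs") / "risk_emb.yaml", "r") as stream:
        emb_cfg = yaml.safe_load(stream)
    print(emb_cfg["num_epochs_ats"], emb_cfg["num_epochs_ex"])  # -> 10 5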
@@ -9,6 +9,7 @@ threshold_weeks: 8
threshold_training: 10
fall_exercise_threshold: 3
fall_exercises: ['8058','8062','8066','8077','8074','8059','8071','8067']
risk_period_months: 6
# Settings for data loader -------------------------------------------------
#
@@ -17,7 +18,19 @@ features_to_normalize: ['BirthYear', 'Cluster', 'LoanPeriod', 'NumberAts']
features_to_scale: ['Gender_Male', 'Gender_Female', 'BirthYear',
'Cluster', 'LoanPeriod', 'NumberAts']
# Settings for data script -------------------------------------------------
#
standard_features: ['Gender_Male', 'Gender_Female', 'BirthYear',
'Cluster', 'LoanPeriod']
extended_features: ['Gender_Male', 'Gender_Female', 'BirthYear', 'Cluster',
'LoanPeriod', 'NumberSplit', 'NumberScreening', 'NumberWeeks',
'MeanEvaluation', 'NumberFalls', 'NumberTraining',
'NumberTrainingWeek', 'TimeBetweenTraining',
'NumberWeeksNoTraining', 'Needs', 'Physics']
# Settings for dataset -------------------------------------------------
#
use_real_ats_names: True
use_real_ats_names: False
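Note: standard_features and extended_features centralize the column lists that were previously hard-coded in the data script; a sketch of how they are picked per case (mirrors the make_dataset_full change further down):

    import yaml

    with open("configs/settings.yaml") as stream:  # path assumed
        settings = yaml.safe_load(stream)
    target_name = "Risk"
    fts = (settings["extended_features"] if target_name == "Risk"
           else settings["standard_features"])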
#!/usr/bin/env python
import numpy as np
import pandas as pd
import paths as pt
from tools import classifiers, data_loader
from tools import data_loader
from tools.classifiers import ClassifierResult
from pathlib import Path
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import skew, boxcox
@@ -12,19 +12,30 @@ from sklearn.preprocessing import RobustScaler, MaxAbsScaler, QuantileTransforme
from tools.classifiers import KnnClassifier, SvmClassifier, LrClassifier
from tools.classifiers import XgbClassifier, RfClassifier, MlpClassifier
import csv
from pathlib import Path
import paths as pt
import yaml
from typing import List
class DummyScaler(BaseEstimator, TransformerMixin):
def fit_transform(self, X):
return np.array(X)
class DummyNormalizer(BaseEstimator, TransformerMixin):
def fit_transform(self, X):
def fit_transform(self, X, case=None):
return np.array(X)
class BoxCoxNormalizer(BaseEstimator, TransformerMixin):
def fit_transform(self, X):
numeric_feats = ['Gender_Male', 'Gender_Female', 'BirthYear',
'Cluster', 'LoanPeriod', 'NumberAts']
def fit_transform(self, X, case=None):
if case == "Risk":
numeric_feats = ['Gender_Male', 'Gender_Female', 'BirthYear', 'Cluster',
'LoanPeriod', 'NumberSplit', 'NumberScreening', 'NumberWeeks',
'MeanEvaluation', 'NumberFalls', 'NumberTraining', 'NumberTrainingWeek',
'TimeBetweenTraining', 'NumberWeeksNoTraining', 'Needs', 'Physics',
'NumberAts', 'NumberEx']
else:
numeric_feats = ['Gender_Male', 'Gender_Female', 'BirthYear',
'Cluster', 'LoanPeriod', 'NumberAts']
skewed_feats = X[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[skewed_feats > 0.25]
skewed_feats = skewed_feats.index
@@ -34,8 +45,15 @@ class BoxCoxNormalizer(BaseEstimator, TransformerMixin):
return np.array(X)
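Note: the elided body applies a Box-Cox-style transform to the features whose skew exceeds 0.25. A self-contained sketch of that selection step (log1p stands in for the transform actually used):

    import numpy as np
    import pandas as pd
    from scipy.stats import skew

    X = pd.DataFrame({"LoanPeriod": [1.0, 2.0, 3.0, 40.0],
                      "NumberAts": [2.0, 3.0, 3.0, 25.0]})
    skewed = X.apply(lambda x: skew(x.dropna()))
    skewed = skewed[skewed > 0.25].index   # same 0.25 threshold as above
    X[skewed] = np.log1p(X[skewed])        # stand-in for the Box-Cox call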
class BoxCoxNormalizerNoGender(BaseEstimator, TransformerMixin):
def fit_transform(self, X):
numeric_feats = ['BirthYear', 'Cluster', 'LoanPeriod', 'NumberAts']
def fit_transform(self, X, case=None):
if case == "Risk":
numeric_feats = ['BirthYear', 'Cluster',
'LoanPeriod', 'NumberSplit', 'NumberScreening', 'NumberWeeks',
'MeanEvaluation', 'NumberFalls', 'NumberTraining', 'NumberTrainingWeek',
'TimeBetweenTraining', 'NumberWeeksNoTraining', 'Needs', 'Physics',
'NumberAts', 'NumberEx']
else:
numeric_feats = ['BirthYear', 'Cluster', 'LoanPeriod', 'NumberAts']
skewed_feats = X[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[skewed_feats > 0.25]
skewed_feats = skewed_feats.index
@@ -45,7 +63,10 @@ class BoxCoxNormalizerNoGender(BaseEstimator, TransformerMixin):
return np.array(X)
def main():
cases = ["Complete", "Compliance", "Fall"]
with open(Path.joinpath(pt.CONFIGS_DIR, "settings.yaml"), 'r') as stream:
settings = yaml.safe_load(stream)
#cases = ["Complete", "Compliance", "Fall", "Risk"]
cases = ['Risk']
normalizer_names = ["None", "BoxCox", "BoxCoxNoGender"]
normalizers = [DummyNormalizer(), BoxCoxNormalizer(), BoxCoxNormalizerNoGender()]
scaler_names = ["None", "Standard", "MinMax", "MinMaxRange", "Robust",
@@ -64,18 +85,21 @@ def main():
for normalizer_name, normalizer in zip(normalizer_names, normalizers):
for scaler_name, scaler in zip(scaler_names, scalers):
if case == "Complete":
dl = data_loader.CompleteDataLoader("complete_emb.csv").load_data()
dl = data_loader.CompleteDataLoader("complete_emb.csv", settings).load_data()
X, y = dl.get_data()
elif case == "Compliance":
dl = data_loader.ComplianceDataLoader("compliance_emb.csv").load_data()
dl = data_loader.ComplianceDataLoader("compliance_emb.csv", settings).load_data()
X, y = dl.get_data()
elif case == "Fall":
dl = data_loader.FallDataLoader("fall_emb.csv", settings).load_data()
X, y = dl.get_data()
else:
dl = data_loader.FallDataLoader("fall_emb.csv").load_data()
dl = data_loader.RiskDataLoader("risk_emb.csv", settings).load_data()
X, y = dl.get_data()
emb_cols = X.filter(regex='((\d+)[Ats])\w+', axis=1)
emb_cols = X.filter(regex='((\d+)[Ats|Ex])\w+', axis=1)
n_norm_cols = X.shape[1] - emb_cols.shape[1]
X_sc = pd.DataFrame(normalizer.fit_transform(X.iloc[:,:n_norm_cols]))
X_sc = pd.DataFrame(normalizer.fit_transform(X.iloc[:,:n_norm_cols], case))
X = pd.concat([X_sc, X.iloc[:,n_norm_cols:]], axis=1)
X = np.array(X)
y = np.array(y)
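Note: in the updated filter regex, [Ats|Ex] is a character class (any single one of A, t, s, |, E, x), not alternation; it matches embedding column names assumed to look like 1Ats0 or 2Ex3 only by accident. An explicit alternation would be:

    import re

    # (?:Ats|Ex) says "Ats or Ex" directly instead of relying on a char class.
    pat = re.compile(r"\d+(?:Ats|Ex)\w+")
    assert pat.match("1Ats0") and pat.match("2Ex3")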
@@ -92,11 +116,11 @@ def main():
writer = csv.writer(f)
for clf_name, result in results.items():
data = [clf_name, normalizer_name, scaler_name,
round(np.mean(result[0])*100, 3),
round(np.mean(result[1])*100, 3),
round(np.mean(result[2])*100, 3),
round(np.mean(result[3])*100, 3),
round(np.mean(result[4])*100, 3)]
round(np.mean(result.accuracy)*100, 3),
round(np.mean(result.precision)*100, 3),
round(np.mean(result.recall)*100, 3),
round(np.mean(result.rocauc)*100, 3),
round(np.mean(result.prauc)*100, 3)]
writer.writerow(data)
if __name__ == '__main__':
......
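Note: the rewrite replaces positional indexing (result[0] … result[4]) with named attributes. One shape tools.classifiers.ClassifierResult could take, assumed here, is a NamedTuple, which keeps both access styles working:

    from typing import List, NamedTuple

    class ClassifierResult(NamedTuple):
        accuracy: List[float]
        precision: List[float]
        recall: List[float]
        rocauc: List[float]
        prauc: List[float]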
@@ -20,22 +20,35 @@ def main(ats_resolution: int = None):
if ats_resolution == None:
ats_resolution = settings['ats_resolution']
ex_resolution = settings['ex_resolution']
for target_name in ["Complete", "Compliance", "Fall"]:
ats = {str(i)+'Ats':str for i in range(1, ats_resolution+1)}
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
f'{target_name.lower()}.csv',
converters=ats)
# Make a df to be encoded
emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
df_to_enc = df.iloc[:,n_numerical_cols:]
for target_name in ["Complete", "Compliance", "Fall", "Risk"]:
if target_name in ["Complete", "Compliance", "Fall"]:
ats = {str(i)+'Ats':str for i in range(1, ats_resolution+1)}
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
f'{target_name.lower()}.csv',
converters=ats)
else:
ex = {str(i)+'Ex':str for i in range(1, ex_resolution+1)}
ats = {str(i)+'Ats':str for i in range(1, ats_resolution+1)}
converters = {**ex, **ats}
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
f'{target_name.lower()}.csv',
converters=converters)
if target_name in ["Complete", "Compliance", "Fall"]:
emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
df_to_enc = df.iloc[:,n_numerical_cols:]
ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
df = df.drop(ats_cols, axis=1)
else:
ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
ex_cols = [str(i)+'Ex' for i in range(1, ex_resolution+1)]
df_ats_to_enc = df.filter(regex=f'Risk|((\d+)[Ats])\w+', axis=1)
df_ex_to_enc = df.filter(regex=f'Risk|((\d+)[Ex])\w+', axis=1)
df = df.drop(ats_cols + ex_cols, axis=1)
# Remove old columns from original df
ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
df = df.drop(ats_cols, axis=1)
# Load embedded config
with open(Path.joinpath(pt.CONFIGS_DIR,
f"{target_name.lower()}_emb.yaml"), 'r') as stream:
@@ -43,45 +56,46 @@ def main(ats_resolution: int = None):
# Encode dataframe given params
model_path = Path.joinpath(pt.ROOT_DIR, emb_cfg['model_path'])
df_enc = encode_dataframe(df=df_to_enc,
target_name=emb_cfg['target_name'],
batch_size=emb_cfg['batch_size'],
train_ratio=emb_cfg['train_ratio'],
epochs=emb_cfg['num_epochs'],
optimizer=emb_cfg['optimizer'],
network_layers=emb_cfg['network_layers'],
verbose=emb_cfg['verbose'],
model_path=model_path)
if target_name in ["Complete", "Compliance", "Fall"]:
df_enc = encode_dataframe(df=df_to_enc,
target_name=emb_cfg['target_name'],
batch_size=emb_cfg['batch_size'],
train_ratio=emb_cfg['train_ratio'],
epochs=emb_cfg['num_epochs'],
optimizer=emb_cfg['optimizer'],
network_layers=emb_cfg['network_layers'],
verbose=emb_cfg['verbose'],
model_path=model_path)
else:
ats_enc = encode_dataframe(df=df_ats_to_enc,
target_name=emb_cfg['target_name'],
batch_size=emb_cfg['batch_size'],
train_ratio=emb_cfg['train_ratio'],
epochs=emb_cfg['num_epochs_ats'],
optimizer=emb_cfg['optimizer'],
network_layers=emb_cfg['network_layers'],
verbose=emb_cfg['verbose'],
model_path=model_path)
ex_enc = encode_dataframe(df=df_ex_to_enc,
target_name=emb_cfg['target_name'],
batch_size=emb_cfg['batch_size'],
train_ratio=emb_cfg['train_ratio'],
epochs=emb_cfg['num_epochs_ex'],
optimizer=emb_cfg['optimizer'],
network_layers=emb_cfg['network_layers'],
verbose=emb_cfg['verbose'],
model_path=model_path)
df_rand = pd.DataFrame(np.random.rand(len(df),1), columns=['Rand']) # add random var
df = pd.concat([df.drop(target_name, axis=1), df_rand, df_enc, df.pop(target_name)], axis=1)
file_writer.write_csv(df, pt.PROCESSED_DATA_DIR, f'{target_name.lower()}_emb.csv')
def make_fall_test_emb(ats_resolution):
ex = {str(i)+'Ex':str for i in range(1, pt.EX_RESOLUTION+1)}
ats = {str(i)+'Ats':str for i in range(1, ats_resolution+1)}
converters = {**ex, **ats}
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
f'fall_test.csv',
converters=converters)
ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
ex_cols = [str(i)+'Ex' for i in range(1, pt.EX_RESOLUTION+1)]
df_ats_to_enc = df.filter(regex=f'Fall|((\d+)[Ats])\w+', axis=1)
df_ats_to_enc = df_ats_to_enc.drop(['NumberFalls'], axis=1)
df_ex_to_enc = df.filter(regex=f'Fall|((\d+)[Ex])\w+', axis=1)
df_ex_to_enc = df_ex_to_enc.drop(['NumberFalls'], axis=1)
artifacts_path = pt.FALL_TEST_EMB_DIR
ats_enc = encode_dataframe(df_ats_to_enc, 'Fall', artifacts_path)
ex_enc = encode_dataframe(df_ex_to_enc, 'Fall', artifacts_path)
df = df.drop(ats_cols + ex_cols, axis=1)
df = pd.concat([df.drop('Fall', axis=1), ats_enc, ex_enc, df.pop('Fall')], axis=1)
file_writer.write_csv(df, pt.PROCESSED_DATA_DIR, 'fall_test_emb.csv')
if target_name in ["Complete", "Compliance", "Fall"]:
df = pd.concat([df.drop(target_name, axis=1), df_rand, df_enc, df.pop(target_name)], axis=1)
else:
df = pd.concat([df.drop(target_name, axis=1), df_rand, ats_enc, ex_enc,
df.pop(target_name)], axis=1)
file_writer.write_csv(df, pt.PROCESSED_DATA_DIR, f'{target_name.lower()}_emb.csv')
def encode_dataframe(df, target_name, batch_size, train_ratio, epochs,
optimizer, network_layers, verbose, model_path):
X_train, X_val, y_train, y_val, labels = preprocessor.prepare_data_for_embedder(df,
......
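Note: for the Risk case the loader now merges two converter dicts so every split Ats/Ex column is read as a string, and the final concat uses df.pop(target_name) so the label lands in the last column. A sketch of both idioms (resolution values assumed):

    import pandas as pd

    ats_resolution, ex_resolution = 10, 9                  # assumed settings values
    ats = {f"{i}Ats": str for i in range(1, ats_resolution + 1)}
    ex = {f"{i}Ex": str for i in range(1, ex_resolution + 1)}
    converters = {**ex, **ats}                             # {'1Ex': str, ..., '1Ats': str, ...}

    df = pd.DataFrame({"Risk": [0, 1], "BirthYear": [45, 52]})
    # drop() works on a copy, pop() then removes and returns the label column,
    # so the label ends up as the right-most column of the result.
    df = pd.concat([df.drop("Risk", axis=1), df.pop("Risk")], axis=1)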
@@ -18,22 +18,30 @@ def main(ats_resolution: int = None):
settings = yaml.safe_load(stream)
if ats_resolution == None:
ats_resolution = settings['ats_resolution']
ex_resolution = settings['ex_resolution']
standard_fts = settings['standard_features']
extended_fts = settings['extended_features']
for target_name in ['Complete', 'Compliance', 'Fall']:
for target_name in ['Complete', 'Compliance', 'Fall', 'Risk']:
df = screenings.copy()
df['Cluster'] = clusters['Cluster']
# Split cat columns by ATS resolution
df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats',
resolution=ats_resolution)
if target_name == "Risk":
df = preprocessor.split_cat_columns(df, col_to_split='Ex', tag='Ex',
resolution=ex_resolution)
# Encode target label
if target_name == 'Complete':
df = feature_maker.make_complete_feature(df, settings)
elif target_name == 'Compliance':
df = feature_maker.make_compliance_feature(df, settings)
else:
elif target_name == 'Fall':
df = feature_maker.make_fall_feature(df, settings)
else:
df = feature_maker.make_risk_feature(df, fall_data, settings)
# One-hot-encode gender variable
object_cols = ['Gender']
@@ -41,47 +49,27 @@ def main(ats_resolution: int = None):
df = pd.concat([df.drop(object_cols, axis=1), df_enc], axis=1)
# Concat dataframe in proper order
ats_cols = df.filter(regex='Ats', axis=1)
general_cols = df[['Gender_Male', 'Gender_Female',
'BirthYear', 'Cluster', 'LoanPeriod']]
df = pd.concat([general_cols, ats_cols, df[[target_name]]], axis=1)
if target_name in ["Complete", "Compliance", "Fall"]:
ats_cols = df.filter(regex='Ats', axis=1)
df = pd.concat([df[standard_fts], ats_cols, df[[target_name]]], axis=1)
else:
ats_ex_cols = df.filter(regex='Ats|Ex', axis=1)
df = pd.concat([df[extended_fts], ats_ex_cols, df[[target_name]]], axis=1)
if settings['use_real_ats_names']:
ats = file_reader.read_csv(pt.REFERENCES_DIR, 'ats.csv',
converters={'ats_id': str})
df = preprocessor.replace_cat_values(df, ats)
if target_name in ["Complete", "Compliance", "Fall"]:
ats = file_reader.read_csv(pt.REFERENCES_DIR, 'ats.csv',
converters={'ats_id': str})
df = preprocessor.replace_cat_values(df, ats)
else:
ats = file_reader.read_csv(pt.REFERENCES_DIR, 'ats.csv',
converters={'ats_id': str})
ex = file_reader.read_csv(pt.REFERENCES_DIR, 'ex.csv',
converters={'ex_id': str})
df = preprocessor.replace_cat_values(df, ats)
df = preprocessor.replace_cat_values(df, ex)
file_writer.write_csv(df, pt.PROCESSED_DATA_DIR, f'{target_name.lower()}.csv')
def make_fall_test_case(df, clusters, fall_data, use_real_ats_names, ats_resolution):
df['Cluster'] = clusters['Cluster']
with open(Path.joinpath(pt.CONFIGS_DIR, "settings.yaml"), 'r') as stream:
settings = yaml.safe_load(stream)
df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats',
resolution=ats_resolution)
df = preprocessor.split_cat_columns(df, col_to_split='Ex', tag='Ex',
resolution=settings['ex_resolution'])
df = feature_maker.make_fall_test_feature(df, fall_data)
ats_ex_cols = df.filter(regex='Ats|Ex', axis=1)
general_cols = df[['Gender', 'BirthYear', 'Cluster', 'LoanPeriod',
'NumberSplit', 'NumberScreening', 'NumberWeeks',
'MeanEvaluation', 'NumberFalls', 'NumberTraining',
'NumberTrainingWeek', 'TimeBetweenTraining',
'NumberWeeksNoTraining', 'Needs', 'Physics']]
df = pd.concat([general_cols, ats_ex_cols, df[['Fall']]], axis=1)
if use_real_ats_names:
ats = file_reader.read_csv(pt.REFERENCES_DIR, 'ats.csv',
converters={'ats_id': str})
ex = file_reader.read_csv(pt.REFERENCES_DIR, 'ex.csv',
converters={'ex_id': str})
df = preprocessor.replace_cat_values(df, ats)
df = preprocessor.replace_cat_values(df, ex)
file_writer.write_csv(df, pt.PROCESSED_DATA_DIR, f'fall_test.csv')
if __name__ == "__main__":
main()
\ No newline at end of file
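Note: for Risk the script additionally unpacks the exercise column via split_cat_columns with tag 'Ex'. The helper's exact behavior lives in tools.preprocessor; a hedged sketch of the assumed unpacking, one positional column per slot up to the resolution:

    import pandas as pd

    df = pd.DataFrame({"Ex": ["8058,8062", "8071"]})
    resolution = 3                                          # settings['ex_resolution'] assumed
    parts = df["Ex"].str.split(",", expand=True).reindex(columns=range(resolution))
    parts.columns = [f"{i}Ex" for i in range(1, resolution + 1)]
    df = pd.concat([df.drop("Ex", axis=1), parts.fillna("0")], axis=1)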
@@ -48,7 +48,7 @@ def main(dataset_version : str = 'emb'):
X = df.drop([target_name], axis=1)
y = df[target_name]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
stratify=y, random_state=0)
neg, pos = np.bincount(y)
......
@@ -94,53 +94,13 @@ class FallDataLoader(BaseDataLoader):
self.y = y
return self
class FallTestDataLoader(BaseDataLoader):
class RiskDataLoader(BaseDataLoader):
def load_data(self):
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
self.file_name,
converters=self.converters)
X = df.drop(['Fall'], axis=1)
y = df['Fall']
X = df.drop(['Risk'], axis=1)
y = df['Risk']
self.X = X
self.y = y
return self
def prepare_data(self, scaling_strategy: str = None):
X = np.array(self.X)
y = np.array(self.y)
if scaling_strategy != None:
emb_cols = self.X.filter(regex='((\d+)[Ats|Ex])\w+', axis=1)
n_scale_cols = self.X.shape[1] - emb_cols.shape[1]
if scaling_strategy == "Standard":
scaler = StandardScaler()
X_sc = scaler.fit_transform(X[:,:n_scale_cols])
X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
else:
scaler = MinMaxScaler()
X_sc = scaler.fit_transform(X[:,:n_scale_cols])
X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
return X, y
def prepare_data_split(self, test_size: float,
scaling_strategy: str = None):
X = np.array(self.X)
y = np.array(self.y)
if scaling_strategy != None:
emb_cols = self.X.filter(regex='((\d+)[Ats|Ex])\w+', axis=1)
n_scale_cols = self.X.shape[1] - emb_cols.shape[1]
if scaling_strategy == "Standard":
scaler = StandardScaler()
X_sc = scaler.fit_transform(X[:,:n_scale_cols])
X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
else:
scaler = MinMaxScaler()
X_sc = scaler.fit_transform(X[:,:n_scale_cols])
X = np.concatenate([X_sc, X[:,n_scale_cols:]], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
stratify=y, random_state=0)
return X_train, X_test, y_train, y_test
\ No newline at end of file
return self
\ No newline at end of file
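Note: FallTestDataLoader becomes RiskDataLoader and drops its private prepare_data/prepare_data_split copies. Typical usage, as in tune_classifiers above (the settings dict and the inherited split method are assumptions):

    from tools import data_loader

    # settings is assumed to be the parsed settings.yaml dict.
    dl = data_loader.RiskDataLoader("risk_emb.csv", settings).load_data()
    X, y = dl.get_data()
    # prepare_data_split is assumed to now be inherited from BaseDataLoader.
    X_train, X_test, y_train, y_test = dl.prepare_data_split(0.3, "Standard")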
@@ -8,10 +8,10 @@ from typing import Tuple
from utility import data_dto
from pandas.tseries.offsets import DateOffset
def annotate_falls(row, digi_db, cura_db):
def annotate_falls(row, digi_db, cura_db, risk_period):
citizen_id = row['CitizenId']
current_date = pd.Timestamp(row['EndDate'])
end_date = current_date + DateOffset(months=6)
end_date = current_date + DateOffset(months=risk_period)
digi_db['EndDate'] = pd.to_datetime(digi_db['EndDate'])
cura_db['Date'] = pd.to_datetime(cura_db['Date'])
@@ -28,16 +28,18 @@ def annotate_falls(row, digi_db, cura_db):
return 1
return 0
def make_fall_test_feature(df: pd.DataFrame,
cura_falls: pd.DataFrame):
def make_risk_feature(df: pd.DataFrame,
cura_falls: pd.DataFrame,
settings: dict):
cura_falls = cura_falls[['CitizenId', 'Date']]
digi_falls = df[['CitizenId', 'NeedsReason', 'PhysicsReason', 'EndDate']]
digi_falls = digi_falls.fillna('Ingen')
digi_falls = digi_falls[digi_falls['NeedsReason'].str.contains("Fald/uheld")
| digi_falls['PhysicsReason'].str.contains("Fald/uheld")]
df['Fall'] = df[['CitizenId', 'EndDate']].apply(lambda x:
annotate_falls(x, digi_falls, cura_falls), axis=1)
risk_period = settings['risk_period_months']
df['Risk'] = df[['CitizenId', 'EndDate']].apply(lambda x:
annotate_falls(x, digi_falls, cura_falls, risk_period), axis=1)
return df
......
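Note: the six-month fall window is now configurable through settings['risk_period_months']. The labelling reduces to a date-range check; a worked sketch with the default of 6 months:

    import pandas as pd
    from pandas.tseries.offsets import DateOffset

    risk_period = 6                                    # settings['risk_period_months']
    current_date = pd.Timestamp("2021-03-01")          # screening EndDate
    end_date = current_date + DateOffset(months=risk_period)
    fall_date = pd.Timestamp("2021-07-15")             # a registered fall
    risk = int(current_date <= fall_date <= end_date)  # -> 1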