Commit 90f7a573 authored by thecml's avatar thecml
Browse files

added yaml cfgs for embedder

parent c32ed0b1
Pipeline #64250 failed with stage
in 2 minutes and 37 seconds
---
# Dataset Stuff -------------------------------------------------
#
target_name: "Complete"
model_path: models/complete/embeddings
# Training Hyperparams --------------------------------------
train_ratio: 0.8
batch_size: 32
num_epochs: 5
verbose: True
# Network Hyperparams --------------------------------------
network_layers: [128]
\ No newline at end of file
---
# Dataset Stuff -------------------------------------------------
#
target_name: "Compliance"
model_path: models/compliance/embeddings
# Training Hyperparams --------------------------------------
train_ratio: 0.8
batch_size: 32
num_epochs: 20
verbose: True
# Network Hyperparams --------------------------------------
network_layers: [128]
\ No newline at end of file
---
# Dataset Stuff -------------------------------------------------
#
target_name: "Fall"
model_path: models/fall/embeddings
# Training Hyperparams --------------------------------------
train_ratio: 0.8
batch_size: 32
num_epochs: 10
verbose: True
# Network Hyperparams --------------------------------------
network_layers: [128]
\ No newline at end of file
......@@ -3,8 +3,6 @@
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\r\n",
"import config as cfg\r\n",
......@@ -27,7 +25,9 @@
"\r\n",
"file_name = f\"ats full.csv\"\r\n",
"df.to_csv(Path.joinpath(cfg.REFERENCES_DIR, file_name), index=False)"
]
],
"outputs": [],
"metadata": {}
}
],
"metadata": {
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -16,6 +16,7 @@ PATHS_2020 = ['borgere_hmi_Rasmus_BorgerId_Gender_BirthYear.xlsx',
ROOT_DIR = Path(__file__).absolute().parent.parent
MODELS_DIR = Path.joinpath(ROOT_DIR, 'models')
CONFIGS_DIR = Path.joinpath(ROOT_DIR, 'configs')
REFERENCES_DIR = Path.joinpath(ROOT_DIR, 'references')
REPORTS_DIR = Path.joinpath(ROOT_DIR, 'reports')
REPORTS_PLOTS_DIR = Path.joinpath(ROOT_DIR, 'reports/plots')
......
......@@ -12,6 +12,7 @@ import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
import yaml
USE_CROSS_VALID = False
USE_GROUPING = False
......@@ -21,90 +22,41 @@ def main(ats_resolution: int = None):
if ats_resolution == None:
ats_resolution = cfg.ATS_RESOLUTION
make_complete_emb(ats_resolution)
make_compliance_emb(ats_resolution)
make_fall_emb(ats_resolution)
#make_fall_test_emb(ats_resolution)
for target_name in ["Complete", "Compliance", "Fall"]:
ats = {str(i)+'Ats':str for i in range(1, ats_resolution+1)}
df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
f'{target_name.lower()}.csv',
converters=ats)
def get_config(df_to_enc, target_name, artifacts_path,
               batch_size=32, epochs=5, train_ratio=0.8,
               network_layers=None, verbose=True):
    """Build the settings dictionary that is handed to ``encode_dataframe``.

    :param df_to_enc: the dataframe to be encoded, stored under ``"df"``
    :param target_name: name of the target column
    :param artifacts_path: location stored under ``"artifacts_path"``
    :param batch_size: training batch size
    :param epochs: number of training epochs
    :param train_ratio: value stored under ``"train_ratio"`` (default 0.8)
    :param network_layers: iterable of layer sizes; ``None`` means ``[128]``
    :param verbose: value stored under ``"verbose"`` (default True)
    :return: a new dict holding the settings above
    """
    # Always hand out a fresh list so callers can never share (and mutate)
    # one default instance between configurations.
    if network_layers is None:
        layers = [128]
    else:
        layers = list(network_layers)

    return {
        "df": df_to_enc,
        "target_name": target_name,
        "train_ratio": train_ratio,
        "network_layers": layers,
        "epochs": epochs,
        "batch_size": batch_size,
        "verbose": verbose,
        "artifacts_path": artifacts_path,
    }
def make_complete_emb(ats_resolution):
    """Encode the Ats columns of ``complete.csv`` and write ``complete_emb.csv``.

    Reads the processed "Complete" data set, passes the Ats columns plus the
    target to ``encode_dataframe``, and writes a frame made of the remaining
    columns, a random ``Rand`` column, the encoded columns and the target
    (in that order) to the processed data directory.

    :param ats_resolution: number of ``<i>Ats`` columns (1..ats_resolution)
    """
    target_name = 'Complete'

    # Build the Ats column names once; they are needed both as read
    # converters and for dropping the raw columns afterwards.
    ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
    converters = {col: str for col in ats_cols}
    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
                              'complete.csv',
                              converters=converters)

    # Raw string: '\d' and '\w' are invalid escapes in a normal literal.
    emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
    # Everything from this offset on (matched columns + target) is encoded.
    n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
    df_to_enc = df.iloc[:, n_numerical_cols:]

    artifacts_path = cfg.COMPLETE_EMB_DIR
    df = df.drop(ats_cols, axis=1)

    config = get_config(df_to_enc, target_name, artifacts_path, epochs=5)
    df_enc = encode_dataframe(config)

    df_rand = pd.DataFrame(np.random.rand(len(df), 1), columns=['Rand'])
    # Select the target explicitly instead of popping it from the frame
    # in the middle of the concat argument list.
    target = df[target_name]
    features = df.drop(target_name, axis=1)
    df = pd.concat([features, df_rand, df_enc, target], axis=1)
    file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'complete_emb.csv')
def make_compliance_emb(ats_resolution):
    """Encode the Ats columns of ``compliance.csv`` and write ``compliance_emb.csv``.

    Reads the processed "Compliance" data set, passes the Ats columns plus
    the target to ``encode_dataframe``, and writes a frame made of the
    remaining columns, a random ``Rand`` column, the encoded columns and the
    target (in that order) to the processed data directory.

    :param ats_resolution: number of ``<i>Ats`` columns (1..ats_resolution)
    """
    target_name = 'Compliance'

    # Build the Ats column names once; they are needed both as read
    # converters and for dropping the raw columns afterwards.
    ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
    converters = {col: str for col in ats_cols}
    # Plain literal: the original f-string had no placeholders.
    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
                              'compliance.csv',
                              converters=converters)

    # Raw string: '\d' and '\w' are invalid escapes in a normal literal.
    emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
    # Everything from this offset on (matched columns + target) is encoded.
    n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
    df_to_enc = df.iloc[:, n_numerical_cols:]

    artifacts_path = cfg.COMPLIANCE_EMB_DIR
    df = df.drop(ats_cols, axis=1)

    config = get_config(df_to_enc, target_name, artifacts_path, epochs=20)
    df_enc = encode_dataframe(config)

    df_rand = pd.DataFrame(np.random.rand(len(df), 1), columns=['Rand'])
    # Select the target explicitly instead of popping it from the frame
    # in the middle of the concat argument list.
    target = df[target_name]
    features = df.drop(target_name, axis=1)
    df = pd.concat([features, df_rand, df_enc, target], axis=1)
    file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'compliance_emb.csv')
# Make a df to be encoded
emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
df_to_enc = df.iloc[:,n_numerical_cols:]
# Remove old columns from original df
ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
df = df.drop(ats_cols, axis=1)
def make_fall_emb(ats_resolution):
    """Encode the Ats columns of ``fall.csv`` and write ``fall_emb.csv``.

    Reads the processed "Fall" data set, passes the Ats columns plus the
    target to ``encode_dataframe``, and writes a frame made of the remaining
    columns, a random ``Rand`` column, the encoded columns and the target
    (in that order) to the processed data directory.

    :param ats_resolution: number of ``<i>Ats`` columns (1..ats_resolution)
    """
    target_name = 'Fall'

    # Build the Ats column names once; they are needed both as read
    # converters and for dropping the raw columns afterwards.
    ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
    converters = {col: str for col in ats_cols}
    # Plain literal: the original f-string had no placeholders.
    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
                              'fall.csv',
                              converters=converters)

    # Raw string: '\d' and '\w' are invalid escapes in a normal literal.
    emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
    # Everything from this offset on (matched columns + target) is encoded.
    n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
    df_to_enc = df.iloc[:, n_numerical_cols:]

    artifacts_path = cfg.FALL_EMB_DIR
    df = df.drop(ats_cols, axis=1)

    config = get_config(df_to_enc, target_name, artifacts_path, epochs=10)
    df_enc = encode_dataframe(config)

    df_rand = pd.DataFrame(np.random.rand(len(df), 1), columns=['Rand'])
    # Select the target explicitly instead of popping it from the frame
    # in the middle of the concat argument list.
    target = df[target_name]
    features = df.drop(target_name, axis=1)
    df = pd.concat([features, df_rand, df_enc, target], axis=1)
    file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'fall_emb.csv')
# Load embedded config
with open(Path.joinpath(cfg.CONFIGS_DIR,
f"{target_name.lower()}_emb.yaml"), 'r') as stream:
emb_cfg = yaml.safe_load(stream)
# Encode dataframe given params
model_path = Path.joinpath(cfg.ROOT_DIR, emb_cfg['model_path'])
df_enc = encode_dataframe(df=df_to_enc,
target_name=emb_cfg['target_name'],
batch_size=emb_cfg['batch_size'],
train_ratio=emb_cfg['train_ratio'],
epochs=emb_cfg['num_epochs'],
network_layers=emb_cfg['network_layers'],
verbose=emb_cfg['verbose'],
model_path=model_path)
df_rand = pd.DataFrame(np.random.rand(len(df),1), columns=['Rand'])
df = pd.concat([df.drop(target_name, axis=1), df_rand, df_enc, df.pop(target_name)], axis=1)
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, f'{target_name.lower()}_emb.csv')
def make_fall_test_emb(ats_resolution):
ex = {str(i)+'Ex':str for i in range(1, cfg.EX_RESOLUTION+1)}
ats = {str(i)+'Ats':str for i in range(1, ats_resolution+1)}
......@@ -130,16 +82,14 @@ def make_fall_test_emb(ats_resolution):
df = pd.concat([df.drop('Fall', axis=1), ats_enc, ex_enc, df.pop('Fall')], axis=1)
file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'fall_test_emb.csv')
def encode_dataframe(config):
X_train, X_val, y_train, y_val, labels = preprocessor.prepare_data_for_embedder(config['df'],
config['target_name'],
config['train_ratio'])
network = neural_embedder.NeuralEmbedder(df=config['df'],
target_name=config['target_name'],
epochs=config['epochs'],
batch_size=config['batch_size'],
verbose=config['verbose'],
artifacts_path=config['artifacts_path'])
def encode_dataframe(df, target_name, batch_size, train_ratio,
epochs, network_layers, verbose, model_path):
X_train, X_val, y_train, y_val, labels = preprocessor.prepare_data_for_embedder(df,
target_name,
train_ratio)
network = neural_embedder.NeuralEmbedder(df=df, target_name=target_name, epochs=epochs,
batch_size=batch_size, network_layers=network_layers,
verbose=verbose, model_path=model_path)
network.fit(X_train, y_train, X_val, y_val)
network.save_model()
embedded_weights = network.get_embedded_weights()
......@@ -149,7 +99,7 @@ def encode_dataframe(config):
if ENABLE_EMB_VIZ:
network.make_visualizations_from_network(extension='png')
df_to_enc = config['df'].drop(config['target_name'], axis=1)
df_to_enc = df.drop(target_name, axis=1)
for index in range(df_to_enc.shape[1]):
column = df_to_enc.columns[index]
labels_column = labels[index]
......@@ -158,7 +108,6 @@ def encode_dataframe(config):
Y = pca.fit_transform(embeddings_column)
y_array = np.concatenate(Y)
mapping = dict(zip(labels_column.classes_, y_array))
target_name = config['target_name']
file_writer.write_mapping(mapping,
Path.joinpath(cfg.PROCESSED_DATA_DIR, 'embeddings'),
f'{target_name.lower()}_{column}.csv')
......
......@@ -49,7 +49,7 @@ class NeuralEmbedder:
epochs: int = 10,
batch_size: int = 32,
verbose: bool = False,
artifacts_path: str = 'artifacts'):
model_path: str = 'models'):
self.check_not_empty_dataframe(df)
self.check_target_name(target_name)
......@@ -71,7 +71,7 @@ class NeuralEmbedder:
self.optimizer_fn = optimizer_fn
self.metrics = metrics
self.verbose = verbose
self.artifacts_path = artifacts_path
self.model_path = model_path
self.unique_classes = self.df[self.target_name].nunique()
self.embedded_categories = self.__get_categorial_cols(df, target_name)
......@@ -232,7 +232,7 @@ class NeuralEmbedder:
self.model.set_weights(weights)
def save_model(self) -> None:
# Persists the wrapped model by calling its ``save`` method.
# NOTE(review): two consecutive saves are shown here, one to
# ``artifacts_path`` and one to ``model_path``; this looks like both sides
# of the artifacts_path -> model_path rename rendered together — confirm
# which single call is intended.
self.model.save(self.artifacts_path)
self.model.save(self.model_path)
def get_embedded_weights(self) -> List:
weights_embeddings = []
......@@ -254,28 +254,28 @@ class NeuralEmbedder:
Used to return the path of the stored weights
:return: the path of the stored weights on disk
"""
return Path.joinpath(self.artifacts_path, self.DEFAULT_WEIGHTS_FILENAME)
return Path.joinpath(self.model_path, self.DEFAULT_WEIGHTS_FILENAME)
def get_labels_path(self) -> Path:
"""
Used to return the path of the stored labels
:return: the path of the stored labels on disk
"""
return Path.joinpath(self.artifacts_path, self.DEFAULT_LABELS_FILENAME)
# NOTE(review): the line below is unreachable after the return above; it
# looks like the post-rename (model_path) side of the diff — confirm which
# of the two returns should remain.
return Path.joinpath(self.model_path, self.DEFAULT_LABELS_FILENAME)
def get_scaler_path(self) -> Path:
"""
Used to return the path of the stored scaler
:return: the path of the stored scaler on disk
"""
return Path.joinpath(self.artifacts_path, self.DEFAULT_SCALER_FILENAME)
# NOTE(review): the line below is unreachable after the return above; it
# looks like the post-rename (model_path) side of the diff — confirm which
# of the two returns should remain.
return Path.joinpath(self.model_path, self.DEFAULT_SCALER_FILENAME)
def get_visualizations_dir(self) -> Path:
"""
Used to return the path of the stored visualizations
:return: the path of the stored visualizations on disk
"""
return Path.joinpath(self.artifacts_path, self.DEFAULT_PATH_VISUALIZATIONS)
# NOTE(review): the line below is unreachable after the return above; it
# looks like the post-rename (model_path) side of the diff — confirm which
# of the two returns should remain.
return Path.joinpath(self.model_path, self.DEFAULT_PATH_VISUALIZATIONS)
def save_weights(self, weights: List) -> None:
with open(self.get_weights_path(), 'wb') as f:
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment