Commit 90f7a573 authored by thecml

added yaml cfgs for embedder

parent c32ed0b1
Pipeline #64250 failed
---
# Dataset settings -------------------------------------------------
#
target_name: "Complete"
model_path: models/complete/embeddings
# Training Hyperparams --------------------------------------
train_ratio: 0.8
batch_size: 32
num_epochs: 5
verbose: True
# Network Hyperparams --------------------------------------
network_layers: [128]
\ No newline at end of file
---
# Dataset settings -------------------------------------------------
#
target_name: "Compliance"
model_path: models/compliance/embeddings
# Training Hyperparams --------------------------------------
train_ratio: 0.8
batch_size: 32
num_epochs: 20
verbose: True
# Network Hyperparams --------------------------------------
network_layers: [128]
\ No newline at end of file
---
# Dataset settings -------------------------------------------------
#
target_name: "Fall"
model_path: models/fall/embeddings
# Training Hyperparams --------------------------------------
train_ratio: 0.8
batch_size: 32
num_epochs: 10
verbose: True
# Network Hyperparams --------------------------------------
network_layers: [128]
\ No newline at end of file
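
For reference, each of these YAML files parses with PyYAML's safe_load into a flat dict whose keys feed the embedder call in the script changes below. A minimal sketch, assuming the file is read from a local configs/ directory:

# Minimal sketch: load one of the embedder configs above and inspect it.
# The relative path configs/fall_emb.yaml is an assumed working location.
from pathlib import Path
import yaml

with open(Path('configs') / 'fall_emb.yaml', 'r') as stream:
    emb_cfg = yaml.safe_load(stream)

print(emb_cfg['target_name'])     # 'Fall'
print(emb_cfg['num_epochs'])      # 10
print(emb_cfg['network_layers'])  # [128]
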
@@ -3,8 +3,6 @@
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\r\n",
    "import config as cfg\r\n",
@@ -27,7 +25,9 @@
    "\r\n",
    "file_name = f\"ats full.csv\"\r\n",
    "df.to_csv(Path.joinpath(cfg.REFERENCES_DIR, file_name), index=False)"
   ]
   ],
   "outputs": [],
   "metadata": {}
  }
 ],
 "metadata": {
......
source diff could not be displayed: it is too large.
@@ -16,6 +16,7 @@ PATHS_2020 = ['borgere_hmi_Rasmus_BorgerId_Gender_BirthYear.xlsx',
ROOT_DIR = Path(__file__).absolute().parent.parent
MODELS_DIR = Path.joinpath(ROOT_DIR, 'models')
CONFIGS_DIR = Path.joinpath(ROOT_DIR, 'configs')
REFERENCES_DIR = Path.joinpath(ROOT_DIR, 'references')
REPORTS_DIR = Path.joinpath(ROOT_DIR, 'reports')
REPORTS_PLOTS_DIR = Path.joinpath(ROOT_DIR, 'reports/plots')
......
@@ -12,6 +12,7 @@ import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
import yaml
USE_CROSS_VALID = False
USE_GROUPING = False
@@ -21,90 +22,41 @@ def main(ats_resolution: int = None):
    if ats_resolution is None:
        ats_resolution = cfg.ATS_RESOLUTION
    make_complete_emb(ats_resolution)
    make_compliance_emb(ats_resolution)
    make_fall_emb(ats_resolution)
    #make_fall_test_emb(ats_resolution)
    for target_name in ["Complete", "Compliance", "Fall"]:
        ats = {str(i)+'Ats': str for i in range(1, ats_resolution+1)}
        df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
                                  f'{target_name.lower()}.csv',
                                  converters=ats)
def get_config(df_to_enc, target_name, artifacts_path,
               batch_size=32, epochs=5):
    return {
        "df": df_to_enc,
        "target_name": target_name,
        "train_ratio": 0.8,
        "network_layers": ([128]),
        "epochs": epochs,
        "batch_size": batch_size,
        "verbose": True,
        "artifacts_path": artifacts_path
    }
def make_complete_emb(ats_resolution):
    target_name = 'Complete'
    ats = {str(i)+'Ats': str for i in range(1, ats_resolution+1)}
    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
                              'complete.csv',
                              converters=ats)
    emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
    n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
    df_to_enc = df.iloc[:, n_numerical_cols:]
    artifacts_path = cfg.COMPLETE_EMB_DIR
    ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
    df = df.drop(ats_cols, axis=1)
    config = get_config(df_to_enc, target_name, artifacts_path, epochs=5)
    df_enc = encode_dataframe(config)
    df_rand = pd.DataFrame(np.random.rand(len(df), 1), columns=['Rand'])
    df = pd.concat([df.drop(target_name, axis=1), df_rand, df_enc, df.pop(target_name)], axis=1)
    file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'complete_emb.csv')
def make_compliance_emb(ats_resolution):
    target_name = 'Compliance'
    ats = {str(i)+'Ats': str for i in range(1, ats_resolution+1)}
    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
                              'compliance.csv',
                              converters=ats)
    emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
    n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
    df_to_enc = df.iloc[:, n_numerical_cols:]
    artifacts_path = cfg.COMPLIANCE_EMB_DIR
    ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
    df = df.drop(ats_cols, axis=1)
    config = get_config(df_to_enc, target_name, artifacts_path, epochs=20)
    df_enc = encode_dataframe(config)
    df_rand = pd.DataFrame(np.random.rand(len(df), 1), columns=['Rand'])
    df = pd.concat([df.drop(target_name, axis=1), df_rand, df_enc, df.pop(target_name)], axis=1)
    file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'compliance_emb.csv')
        # Make a df to be encoded
        emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
        n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
        df_to_enc = df.iloc[:, n_numerical_cols:]

        # Remove old columns from original df
        ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
        df = df.drop(ats_cols, axis=1)
def make_fall_emb(ats_resolution):
    target_name = 'Fall'
    ats = {str(i)+'Ats': str for i in range(1, ats_resolution+1)}
    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
                              'fall.csv',
                              converters=ats)
    emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
    n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
    df_to_enc = df.iloc[:, n_numerical_cols:]
    artifacts_path = cfg.FALL_EMB_DIR
    ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
    df = df.drop(ats_cols, axis=1)
    config = get_config(df_to_enc, target_name, artifacts_path, epochs=10)
    df_enc = encode_dataframe(config)
    df_rand = pd.DataFrame(np.random.rand(len(df), 1), columns=['Rand'])
    df = pd.concat([df.drop(target_name, axis=1), df_rand, df_enc, df.pop(target_name)], axis=1)
    file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'fall_emb.csv')
        # Load the embedder config
        with open(Path.joinpath(cfg.CONFIGS_DIR,
                                f"{target_name.lower()}_emb.yaml"), 'r') as stream:
            emb_cfg = yaml.safe_load(stream)

        # Encode the dataframe given the config params
        model_path = Path.joinpath(cfg.ROOT_DIR, emb_cfg['model_path'])
        df_enc = encode_dataframe(df=df_to_enc,
                                  target_name=emb_cfg['target_name'],
                                  batch_size=emb_cfg['batch_size'],
                                  train_ratio=emb_cfg['train_ratio'],
                                  epochs=emb_cfg['num_epochs'],
                                  network_layers=emb_cfg['network_layers'],
                                  verbose=emb_cfg['verbose'],
                                  model_path=model_path)

        df_rand = pd.DataFrame(np.random.rand(len(df), 1), columns=['Rand'])
        df = pd.concat([df.drop(target_name, axis=1), df_rand, df_enc, df.pop(target_name)], axis=1)
        file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, f'{target_name.lower()}_emb.csv')
def make_fall_test_emb(ats_resolution):
    ex = {str(i)+'Ex': str for i in range(1, cfg.EX_RESOLUTION+1)}
    ats = {str(i)+'Ats': str for i in range(1, ats_resolution+1)}
@@ -130,16 +82,14 @@ def make_fall_test_emb(ats_resolution):
    df = pd.concat([df.drop('Fall', axis=1), ats_enc, ex_enc, df.pop('Fall')], axis=1)
    file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, 'fall_test_emb.csv')
def encode_dataframe(config):
    X_train, X_val, y_train, y_val, labels = preprocessor.prepare_data_for_embedder(config['df'],
                                                                                    config['target_name'],
                                                                                    config['train_ratio'])
    network = neural_embedder.NeuralEmbedder(df=config['df'],
                                             target_name=config['target_name'],
                                             epochs=config['epochs'],
                                             batch_size=config['batch_size'],
                                             verbose=config['verbose'],
                                             artifacts_path=config['artifacts_path'])
def encode_dataframe(df, target_name, batch_size, train_ratio,
                     epochs, network_layers, verbose, model_path):
    X_train, X_val, y_train, y_val, labels = preprocessor.prepare_data_for_embedder(df,
                                                                                    target_name,
                                                                                    train_ratio)
    network = neural_embedder.NeuralEmbedder(df=df, target_name=target_name, epochs=epochs,
                                             batch_size=batch_size, network_layers=network_layers,
                                             verbose=verbose, model_path=model_path)
    network.fit(X_train, y_train, X_val, y_val)
    network.save_model()
    embedded_weights = network.get_embedded_weights()
@@ -149,7 +99,7 @@ def encode_dataframe(config):
    if ENABLE_EMB_VIZ:
        network.make_visualizations_from_network(extension='png')

    df_to_enc = config['df'].drop(config['target_name'], axis=1)
    df_to_enc = df.drop(target_name, axis=1)
    for index in range(df_to_enc.shape[1]):
        column = df_to_enc.columns[index]
        labels_column = labels[index]
@@ -158,7 +108,6 @@ def encode_dataframe(config):
        Y = pca.fit_transform(embeddings_column)
        y_array = np.concatenate(Y)
        mapping = dict(zip(labels_column.classes_, y_array))
        target_name = config['target_name']
        file_writer.write_mapping(mapping,
                                  Path.joinpath(cfg.PROCESSED_DATA_DIR, 'embeddings'),
                                  f'{target_name.lower()}_{column}.csv')
......
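
Since the YAML keys line up almost one-to-one with encode_dataframe's parameters, the keyword-by-keyword call in main() above could also be written with dictionary unpacking. A hedged alternative sketch, not what the commit does; it reuses emb_cfg, df_to_enc and cfg from the loop above and renames the one mismatched key:

# Alternative sketch (not the commit's code): splat the loaded YAML config
# into encode_dataframe, renaming 'num_epochs' to the 'epochs' parameter.
from pathlib import Path

kwargs = dict(emb_cfg)                       # copy; leave the loaded config intact
kwargs['epochs'] = kwargs.pop('num_epochs')  # YAML key vs. function parameter name
kwargs['model_path'] = Path.joinpath(cfg.ROOT_DIR, kwargs.pop('model_path'))
df_enc = encode_dataframe(df=df_to_enc, **kwargs)

The explicit version in the diff is arguably safer: it fails loudly on a missing key and keeps the YAML schema decoupled from the function signature.
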
@@ -49,7 +49,7 @@ class NeuralEmbedder:
                 epochs: int = 10,
                 batch_size: int = 32,
                 verbose: bool = False,
                 artifacts_path: str = 'artifacts'):
                 model_path: str = 'models'):
        self.check_not_empty_dataframe(df)
        self.check_target_name(target_name)
@@ -71,7 +71,7 @@ class NeuralEmbedder:
        self.optimizer_fn = optimizer_fn
        self.metrics = metrics
        self.verbose = verbose
        self.artifacts_path = artifacts_path
        self.model_path = model_path
        self.unique_classes = self.df[self.target_name].nunique()
        self.embedded_categories = self.__get_categorial_cols(df, target_name)
@@ -232,7 +232,7 @@ class NeuralEmbedder:
        self.model.set_weights(weights)

    def save_model(self) -> None:
        self.model.save(self.artifacts_path)
        self.model.save(self.model_path)

    def get_embedded_weights(self) -> List:
        weights_embeddings = []
@@ -254,28 +254,28 @@
        Used to return the path of the stored weights
        :return: the path of the stored weights on disk
        """
        return Path.joinpath(self.artifacts_path, self.DEFAULT_WEIGHTS_FILENAME)
        return Path.joinpath(self.model_path, self.DEFAULT_WEIGHTS_FILENAME)

    def get_labels_path(self) -> Path:
        """
        Used to return the path of the stored labels
        :return: the path of the stored labels on disk
        """
        return Path.joinpath(self.artifacts_path, self.DEFAULT_LABELS_FILENAME)
        return Path.joinpath(self.model_path, self.DEFAULT_LABELS_FILENAME)

    def get_scaler_path(self) -> Path:
        """
        Used to return the path of the stored scaler
        :return: the path of the stored scaler on disk
        """
        return Path.joinpath(self.artifacts_path, self.DEFAULT_SCALER_FILENAME)
        return Path.joinpath(self.model_path, self.DEFAULT_SCALER_FILENAME)

    def get_visualizations_dir(self) -> Path:
        """
        Used to return the path of the stored visualizations
        :return: the path of the stored visualizations on disk
        """
        return Path.joinpath(self.artifacts_path, self.DEFAULT_PATH_VISUALIZATIONS)
        return Path.joinpath(self.model_path, self.DEFAULT_PATH_VISUALIZATIONS)

    def save_weights(self, weights: List) -> None:
        with open(self.get_weights_path(), 'wb') as f:
......
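
With the rename, callers construct the embedder via model_path instead of artifacts_path. A minimal sketch of the new call site; the import and the toy dataframe are assumptions for illustration, only the keyword names come from the diff:

# Sketch only: the import path and data are stand-ins, not the project's code.
from pathlib import Path
import pandas as pd
import neural_embedder  # assumed importable from the project's source tree

toy_df = pd.DataFrame({'1Ats': ['a', 'b', 'a', 'c'], 'Fall': [0, 1, 0, 1]})
network = neural_embedder.NeuralEmbedder(df=toy_df,
                                         target_name='Fall',
                                         epochs=10,
                                         batch_size=32,
                                         verbose=True,
                                         model_path=Path('models/fall/embeddings'))
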
File moved