# make_dataset_emb.py (7.63 KB)
1
#!/usr/bin/env python
2
from tools import file_reader, file_writer
3
4
5
from tools import preprocessor, neural_embedder
import pandas as pd
import numpy as np
6
import paths as pt
7
8
from pathlib import Path
from sklearn.decomposition import PCA
thecml's avatar
thecml committed
9
10
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
thecml's avatar
thecml committed
11
import yaml
12

thecml's avatar
thecml committed
13
# Module-level switches.
# USE_CROSS_VALID and USE_GROUPING are not read anywhere in the visible part
# of this file.
USE_CROSS_VALID = False
USE_GROUPING = False
# When True, the encoders call
# network.make_visualizations_from_network(extension='png') after training.
ENABLE_EMB_VIZ = False

17
def main(ats_resolution: int = None):
    """Build and write the embedded dataset for every target.

    For each of "Complete", "Compliance" and "Fall" the processed CSV
    ``<target>.csv`` is read, its ``<n>Ats`` columns are replaced by the
    frame returned from ``encode_dataframe``, a uniformly random ``Rand``
    column is inserted, and the result is written to ``<target>_emb.csv``
    in ``pt.PROCESSED_DATA_DIR``.

    Args:
        ats_resolution: Number of ``<n>Ats`` columns (``1Ats`` ..
            ``<ats_resolution>Ats``). When ``None`` the ``ats_resolution``
            entry of ``settings.yaml`` is used.
    """
    with open(Path.joinpath(pt.CONFIGS_DIR, "settings.yaml"), 'r') as stream:
        settings = yaml.safe_load(stream)

    if ats_resolution is None:
        ats_resolution = settings['ats_resolution']

    # The Ats column names depend only on the resolution, so build them once.
    ats_cols = [f'{i}Ats' for i in range(1, ats_resolution + 1)]
    # Read every Ats column as str.
    converters = {col: str for col in ats_cols}

    for target_name in ["Complete", "Compliance", "Fall"]:
        df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                                  f'{target_name.lower()}.csv',
                                  converters=converters)

        # Make a df to be encoded. The slice keeps the last
        # len(emb_cols) + 1 columns, i.e. it relies on the Ats columns and
        # the target being the trailing columns of the CSV.
        emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
        n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
        df_to_enc = df.iloc[:, n_numerical_cols:]

        # Remove old columns from original df
        df = df.drop(ats_cols, axis=1)

        # Load embedded config
        with open(Path.joinpath(pt.CONFIGS_DIR,
                                f"{target_name.lower()}_emb.yaml"), 'r') as stream:
            emb_cfg = yaml.safe_load(stream)

        # Encode dataframe given params
        model_path = Path.joinpath(pt.ROOT_DIR, emb_cfg['model_path'])
        df_enc = encode_dataframe(df=df_to_enc,
                                  target_name=emb_cfg['target_name'],
                                  batch_size=emb_cfg['batch_size'],
                                  train_ratio=emb_cfg['train_ratio'],
                                  epochs=emb_cfg['num_epochs'],
                                  optimizer=emb_cfg['optimizer'],
                                  network_layers=emb_cfg['network_layers'],
                                  verbose=emb_cfg['verbose'],
                                  model_path=model_path)

        # Build the random column on df's own index so the axis=1 concat
        # lines rows up one-to-one even if the index is not 0..n-1.
        df_rand = pd.DataFrame(np.random.rand(len(df), 1),
                               columns=['Rand'], index=df.index)
        target = df[target_name]
        # Column order: remaining features, Rand, embeddings, target last.
        df = pd.concat([df.drop(target_name, axis=1), df_rand, df_enc, target],
                       axis=1)
        file_writer.write_csv(df, pt.PROCESSED_DATA_DIR,
                              f'{target_name.lower()}_emb.csv')
thecml's avatar
thecml committed
59
        
60
def make_fall_test_emb(ats_resolution):
    """Build and write the embedded fall-test dataset.

    Reads ``fall_test.csv``, encodes the ``<n>Ats`` and ``<n>Ex`` column
    groups separately with ``encode_dataframe`` (target ``Fall``), replaces
    the raw columns by the encoded ones and writes ``fall_test_emb.csv`` to
    ``pt.PROCESSED_DATA_DIR``.

    Args:
        ats_resolution: Number of ``<n>Ats`` columns. The number of ``<n>Ex``
            columns comes from ``pt.EX_RESOLUTION``.
    """
    ats_cols = [f'{i}Ats' for i in range(1, ats_resolution + 1)]
    ex_cols = [f'{i}Ex' for i in range(1, pt.EX_RESOLUTION + 1)]
    # Read every Ex and Ats column as str.
    converters = {col: str for col in ex_cols + ats_cols}
    df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                              'fall_test.csv',
                              converters=converters)

    # 'Fall' in the pattern also selects 'NumberFalls', hence the drop.
    df_ats_to_enc = df.filter(regex=r'Fall|((\d+)[Ats])\w+', axis=1)
    df_ats_to_enc = df_ats_to_enc.drop(['NumberFalls'], axis=1)

    df_ex_to_enc = df.filter(regex=r'Fall|((\d+)[Ex])\w+', axis=1)
    df_ex_to_enc = df_ex_to_enc.drop(['NumberFalls'], axis=1)

    # encode_dataframe takes nine required arguments; the previous
    # three-argument call raised TypeError.
    # NOTE(review): the hyper-parameters are taken from fall_emb.yaml (the
    # config main() loads for the "Fall" target) and artifacts go to
    # pt.FALL_TEST_EMB_DIR -- confirm this is the intended configuration.
    with open(Path.joinpath(pt.CONFIGS_DIR, "fall_emb.yaml"), 'r') as stream:
        emb_cfg = yaml.safe_load(stream)
    encoder_params = dict(target_name='Fall',
                          batch_size=emb_cfg['batch_size'],
                          train_ratio=emb_cfg['train_ratio'],
                          epochs=emb_cfg['num_epochs'],
                          optimizer=emb_cfg['optimizer'],
                          network_layers=emb_cfg['network_layers'],
                          verbose=emb_cfg['verbose'],
                          model_path=pt.FALL_TEST_EMB_DIR)

    ats_enc = encode_dataframe(df=df_ats_to_enc, **encoder_params)
    ex_enc = encode_dataframe(df=df_ex_to_enc, **encoder_params)

    df = df.drop(ats_cols + ex_cols, axis=1)
    target = df['Fall']
    # Column order: remaining features, Ats embeddings, Ex embeddings, target.
    df = pd.concat([df.drop('Fall', axis=1), ats_enc, ex_enc, target], axis=1)
    file_writer.write_csv(df, pt.PROCESSED_DATA_DIR, 'fall_test_emb.csv')
thecml's avatar
thecml committed
84

thecml's avatar
thecml committed
85
86
def encode_dataframe(df, target_name, batch_size, train_ratio, epochs,
                     optimizer, network_layers, verbose, model_path):
    """Train a neural embedder on ``df`` and map each feature value to a scalar.

    The network is fitted on a train/validation split from
    ``preprocessor.prepare_data_for_embedder``; the model, its embedding
    weights and the labels are saved through the network object. For every
    feature column the embedding matrix is reduced to one PCA component,
    giving one number per label class. That mapping is written to
    ``<PROCESSED_DATA_DIR>/embeddings/<target>_<column>.csv`` and applied to
    the column.

    Args:
        df: Frame holding the feature columns plus ``target_name``.
        target_name: Name of the target column in ``df``.
        batch_size, epochs, network_layers, verbose, model_path: Forwarded
            unchanged to ``neural_embedder.NeuralEmbedder``.
        train_ratio: Forwarded to ``preprocessor.prepare_data_for_embedder``.
        optimizer: Forwarded to ``NeuralEmbedder`` as ``optimizer_fn``.

    Returns:
        A copy of ``df`` without ``target_name`` whose column values have
        been replaced through the per-column mapping.
    """
    X_train, X_val, y_train, y_val, labels = preprocessor.prepare_data_for_embedder(
        df, target_name, train_ratio)
    network = neural_embedder.NeuralEmbedder(df=df, target_name=target_name,
                                             epochs=epochs, batch_size=batch_size,
                                             network_layers=network_layers,
                                             optimizer_fn=optimizer, verbose=verbose,
                                             model_path=model_path)
    network.fit(X_train, y_train, X_val, y_val)
    network.save_model()
    embedded_weights = network.get_embedded_weights()
    network.save_weights(embedded_weights)
    network.save_labels(labels)

    if ENABLE_EMB_VIZ:
        network.make_visualizations_from_network(extension='png')

    df_to_enc = df.drop(target_name, axis=1)
    mapping_dir = Path.joinpath(pt.PROCESSED_DATA_DIR, 'embeddings')
    # labels[i] and embedded_weights[i] belong to the i-th feature column.
    for index, column in enumerate(df_to_enc.columns):
        labels_column = labels[index]
        embeddings_column = embedded_weights[index]
        # Reduce each class's embedding vector to a single number.
        pca = PCA(n_components=1)
        y_array = pca.fit_transform(embeddings_column)[:, 0]
        # presumably labels_column is a fitted label encoder (it exposes
        # classes_) -- verify against preprocessor.
        mapping = dict(zip(labels_column.classes_, y_array))
        file_writer.write_mapping(mapping,
                                  mapping_dir,
                                  f'{target_name.lower()}_{column}.csv')
        df_to_enc[column] = df_to_enc[column].replace(to_replace=mapping)

    return df_to_enc

thecml's avatar
thecml committed
118
119
120
def encode_dataframe_cv(df, target_name, batch_size, train_ratio,
                        epochs, network_layers, verbose, model_path):
    """Encode ``df`` using embedding weights averaged over 5 stratified folds.

    The embedder is fitted once per fold of a shuffled ``StratifiedKFold``
    (5 splits, ``random_state=0``) with early stopping on ``val_loss``. The
    embedding weights collected after each fold are averaged element-wise,
    saved with the labels, reduced to one PCA component per feature column
    and applied to the column as a class-to-number mapping. Each mapping is
    also written to ``<PROCESSED_DATA_DIR>/embeddings/<target>_<column>.csv``.

    Args:
        df: Frame holding the feature columns plus ``target_name``.
        target_name: Name of the target column in ``df``.
        batch_size, epochs, network_layers, verbose, model_path: Forwarded
            unchanged to ``neural_embedder.NeuralEmbedder``.
        train_ratio: Not used by this function; the folds define the split.

    Returns:
        A copy of ``df`` without ``target_name`` whose column values have
        been replaced through the per-column mapping.
    """
    X, y = preprocessor.get_X_y(df, target_name)
    X, labels = preprocessor.encode_vector_label(X)
    y = np.array(y)

    network = neural_embedder.NeuralEmbedder(df=df, target_name=target_name,
                                             epochs=epochs, batch_size=batch_size,
                                             network_layers=network_layers,
                                             verbose=verbose, model_path=model_path)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                   mode='min',
                                                   patience=3,
                                                   verbose=0)

    # NOTE(review): the same network instance is fitted in every fold;
    # whether fit() re-initialises the weights depends on NeuralEmbedder.
    fold_weights = []
    for train_index, valid_index in skf.split(X, y):
        X_train, X_valid = X[train_index, :], X[valid_index, :]
        y_train, y_valid = y[train_index], y[valid_index]

        network.fit(X_train, y_train, X_valid, y_valid,
                    callbacks=[es_callback])
        fold_weights.append(network.get_embedded_weights())

    # Outer zip groups the folds' matrices per feature column; inner zip
    # walks those matrices row by row and averages each row over the folds.
    new_weights = [
        [np.array(rows).mean(axis=0) for rows in zip(*per_column)]
        for per_column in zip(*fold_weights)
    ]

    network.save_weights(new_weights)
    network.save_labels(labels)

    if ENABLE_EMB_VIZ:
        network.make_visualizations_from_network(extension='png')

    df_to_enc = df.drop(target_name, axis=1)
    mapping_dir = Path.joinpath(pt.PROCESSED_DATA_DIR, 'embeddings')
    # labels[i] and new_weights[i] belong to the i-th feature column.
    for index, column in enumerate(df_to_enc.columns):
        labels_column = labels[index]
        embeddings_column = new_weights[index]
        # Reduce each class's averaged embedding vector to a single number.
        pca = PCA(n_components=1)
        y_array = pca.fit_transform(embeddings_column)[:, 0]
        mapping = dict(zip(labels_column.classes_, y_array))
        file_writer.write_mapping(mapping,
                                  mapping_dir,
                                  f'{target_name.lower()}_{column}.csv')
        df_to_enc[column] = df_to_enc[column].replace(to_replace=mapping)

    return df_to_enc
Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
171

172
173
# Script entry point: main() is called without arguments, so ats_resolution
# is read from settings.yaml.
if __name__ == "__main__":
    main()