# make_dataset_emb.py
1
#!/usr/bin/env python
2
from pandas.core.arrays import boolean
3
4
5
6
7
8
9
10
import config as cfg
from tools import file_reader, file_writer, feature_maker
from tools import preprocessor, neural_embedder
from utility import embedder
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.decomposition import PCA
thecml's avatar
thecml committed
11
12
13
14
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
15

thecml's avatar
thecml committed
16
17
18
19
20
21
22
23
24
# Selects encode_dataframe_cv (and the "*_emb_cv.csv" output names) when True;
# otherwise encode_dataframe and the plain "*_emb.csv" names are used.
USE_CROSS_VALID = True
# When True, the encoders call network.make_visualizations_from_network('png').
ENABLE_EMB_VIZ = False
# Forwarded as the "verbose" entry of the dict built by get_config().
VERBOSE = False

def main():
    """Run every embedding-dataset builder, one after the other."""
    builders = (
        make_complete_emb,
        make_compliance_emb,
        make_fall_emb,
        make_fall_test_emb,
    )
    for build in builders:
        build()
25
    
thecml's avatar
thecml committed
26
27
28
29
30
31
def get_config(df_to_enc, target_name, artifacts_path):
    """Assemble the settings dict that is unpacked into NeuralEmbedder.

    Args:
        df_to_enc: Data frame to encode; stored under ``"df"``.
        target_name: Name of the target column; stored under ``"target_name"``.
        artifacts_path: Stored unchanged under ``"artifacts_path"``.

    Returns:
        A dict holding the three arguments plus the fixed training settings
        (train ratio 0.8, one layer entry of 128, 200 epochs, batch size 32)
        and the module-level ``VERBOSE`` flag.
    """
    config = dict(
        df=df_to_enc,
        target_name=target_name,
        train_ratio=0.8,
        network_layers=[128],
        epochs=200,
        batch_size=32,
        verbose=VERBOSE,
        artifacts_path=artifacts_path,
    )
    return config
    
thecml's avatar
thecml committed
38
def make_complete_emb():
    """Write a copy of ``complete.csv`` whose Ats columns are embedded.

    Reads ``complete.csv`` from ``cfg.PROCESSED_DATA_DIR``, encodes the
    trailing columns against the ``Complete`` target, swaps the raw
    ``<n>Ats`` columns for the encoded ones and writes the result to
    ``complete_emb_cv.csv`` or ``complete_emb.csv`` depending on
    ``USE_CROSS_VALID``.
    """
    target_name = 'Complete'
    ats_cols = [f'{i}Ats' for i in range(1, cfg.ATS_RESOLUTION + 1)]

    # Every Ats column is read through ``str`` so its values stay strings.
    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
                              'complete.csv',
                              converters=dict.fromkeys(ats_cols, str))

    # Raw string: '\d' and '\w' are invalid escapes in a normal literal.
    # Note that [Ats] is a character class (one of 'A', 't', 's').
    emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
    # Columns left over once the matched columns and one more column
    # (presumably the target) are subtracted.
    n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1

    # Assumes the matched columns and the target are the trailing columns
    # of the file -- TODO confirm against the CSV layout.
    df_to_enc = df.iloc[:, n_numerical_cols:]
    # NOTE(review): the 'Complete' target stores its artifacts in the
    # compliance directory; this looks like a copy-paste slip -- confirm
    # whether cfg provides a dedicated directory for this target.
    artifacts_path = cfg.COMPLIANCE_EMB_DIR

    encode = encode_dataframe_cv if USE_CROSS_VALID else encode_dataframe
    df_enc = encode(df_to_enc, target_name, artifacts_path)

    # Rebuild the frame as: remaining columns, encoded columns, target last.
    df = df.drop(ats_cols, axis=1)
    target = df.pop(target_name)
    df = pd.concat([df, df_enc, target], axis=1)

    out_name = 'complete_emb_cv.csv' if USE_CROSS_VALID else 'complete_emb.csv'
    file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, out_name)
    
def make_compliance_emb():
    """Write a copy of ``compliance.csv`` whose Ats columns are embedded.

    Reads ``compliance.csv`` from ``cfg.PROCESSED_DATA_DIR``, encodes the
    trailing columns against the ``Compliance`` target, swaps the raw
    ``<n>Ats`` columns for the encoded ones and writes the result to
    ``compliance_emb_cv.csv`` or ``compliance_emb.csv`` depending on
    ``USE_CROSS_VALID``.
    """
    target_name = 'Compliance'
    ats_cols = [f'{i}Ats' for i in range(1, cfg.ATS_RESOLUTION + 1)]

    # Every Ats column is read through ``str`` so its values stay strings.
    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
                              'compliance.csv',
                              converters=dict.fromkeys(ats_cols, str))

    # Raw string: '\d' and '\w' are invalid escapes in a normal literal.
    # Note that [Ats] is a character class (one of 'A', 't', 's').
    emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
    # Columns left over once the matched columns and one more column
    # (presumably the target) are subtracted.
    n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1

    # Assumes the matched columns and the target are the trailing columns
    # of the file -- TODO confirm against the CSV layout.
    df_to_enc = df.iloc[:, n_numerical_cols:]
    artifacts_path = cfg.COMPLIANCE_EMB_DIR

    encode = encode_dataframe_cv if USE_CROSS_VALID else encode_dataframe
    df_enc = encode(df_to_enc, target_name, artifacts_path)

    # Rebuild the frame as: remaining columns, encoded columns, target last.
    df = df.drop(ats_cols, axis=1)
    target = df.pop(target_name)
    df = pd.concat([df, df_enc, target], axis=1)

    if USE_CROSS_VALID:
        out_name = 'compliance_emb_cv.csv'
    else:
        out_name = 'compliance_emb.csv'
    file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, out_name)
    
def make_fall_emb():
    """Write a copy of ``fall.csv`` whose Ats columns are embedded.

    Reads ``fall.csv`` from ``cfg.PROCESSED_DATA_DIR``, encodes the trailing
    columns against the ``Fall`` target, swaps the raw ``<n>Ats`` columns
    for the encoded ones and writes the result to ``fall_emb_cv.csv`` or
    ``fall_emb.csv`` depending on ``USE_CROSS_VALID``.
    """
    target_name = 'Fall'
    ats_cols = [f'{i}Ats' for i in range(1, cfg.ATS_RESOLUTION + 1)]

    # Every Ats column is read through ``str`` so its values stay strings.
    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
                              'fall.csv',
                              converters=dict.fromkeys(ats_cols, str))

    # Raw string: '\d' and '\w' are invalid escapes in a normal literal.
    # Note that [Ats] is a character class (one of 'A', 't', 's').
    emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
    # Columns left over once the matched columns and one more column
    # (presumably the target) are subtracted.
    n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1

    # Assumes the matched columns and the target are the trailing columns
    # of the file -- TODO confirm against the CSV layout.
    df_to_enc = df.iloc[:, n_numerical_cols:]
    artifacts_path = cfg.FALL_EMB_DIR

    encode = encode_dataframe_cv if USE_CROSS_VALID else encode_dataframe
    df_enc = encode(df_to_enc, target_name, artifacts_path)

    # Rebuild the frame as: remaining columns, encoded columns, target last.
    df = df.drop(ats_cols, axis=1)
    target = df.pop(target_name)
    df = pd.concat([df, df_enc, target], axis=1)

    out_name = 'fall_emb_cv.csv' if USE_CROSS_VALID else 'fall_emb.csv'
    file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, out_name)
    
def make_fall_test_emb():
    """Write a copy of ``fall_test.csv`` with embedded Ats and Ex columns.

    Reads ``fall_test.csv`` from ``cfg.PROCESSED_DATA_DIR``, encodes the
    ``<n>Ats`` columns and the ``<n>Ex`` columns separately against the
    ``Fall`` target, replaces the raw columns with the encoded ones and
    writes ``fall_test_emb_cv.csv`` or ``fall_test_emb.csv`` depending on
    ``USE_CROSS_VALID``.
    """
    ats_cols = [f'{i}Ats' for i in range(1, cfg.ATS_RESOLUTION + 1)]
    ex_cols = [f'{i}Ex' for i in range(1, cfg.EX_RESOLUTION + 1)]

    # Every Ex and Ats column is read through ``str`` so values stay strings.
    converters = dict.fromkeys(ex_cols + ats_cols, str)
    df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
                              'fall_test.csv',
                              converters=converters)

    # Raw strings: '\d' and '\w' are invalid escapes in a normal literal.
    # 'Fall' also matches 'NumberFalls', which is therefore dropped again.
    df_ats_to_enc = df.filter(regex=r'Fall|((\d+)[Ats])\w+', axis=1)
    df_ats_to_enc = df_ats_to_enc.drop(['NumberFalls'], axis=1)

    df_ex_to_enc = df.filter(regex=r'Fall|((\d+)[Ex])\w+', axis=1)
    df_ex_to_enc = df_ex_to_enc.drop(['NumberFalls'], axis=1)

    # NOTE(review): both encoder runs receive the same artifacts path, so
    # the Ex run may overwrite what the Ats run saved -- confirm intent.
    artifacts_path = cfg.FALL_TEST_EMB_DIR

    encode = encode_dataframe_cv if USE_CROSS_VALID else encode_dataframe
    ats_enc = encode(df_ats_to_enc, 'Fall', artifacts_path)
    ex_enc = encode(df_ex_to_enc, 'Fall', artifacts_path)

    # Rebuild the frame as: remaining columns, Ats, Ex, target last.
    df = df.drop(ats_cols + ex_cols, axis=1)
    target = df.pop('Fall')
    df = pd.concat([df, ats_enc, ex_enc, target], axis=1)

    if USE_CROSS_VALID:
        out_name = 'fall_test_emb_cv.csv'
    else:
        out_name = 'fall_test_emb.csv'
    file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, out_name)

def encode_dataframe_cv(df_to_enc, target_name, artifacts_path):
    """Encode feature columns with fold-averaged embeddings reduced by PCA.

    The embedder is fitted once per fold of a stratified 5-fold split, the
    embedding weights collected after each fit are averaged, and every
    feature column is replaced by the first principal component of its
    averaged embedding. Each value-to-number mapping is also written to the
    ``embeddings`` directory under ``cfg.PROCESSED_DATA_DIR``.

    Args:
        df_to_enc: Data frame holding the columns to encode plus the target.
        target_name: Name of the target column inside ``df_to_enc``.
        artifacts_path: Forwarded to the embedder through ``get_config``.

    Returns:
        A copy of ``df_to_enc`` without the target column in which each
        feature column has been passed through its mapping.
    """
    params = get_config(df_to_enc, target_name, artifacts_path)

    X, y = preprocessor.get_X_y(df_to_enc, target_name)
    X, labels = preprocessor.encode_vector_label(X)
    y = np.array(y)

    network = neural_embedder.NeuralEmbedder(**params)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                mode='min',
                                                patience=3,
                                                verbose=1)

    # NOTE(review): the same network instance is fitted on every fold;
    # whether fit() re-initialises its weights is not visible here.
    weights = []
    for train_index, valid_index in skf.split(X, y):
        X_train, X_valid = X[train_index, :], X[valid_index, :]
        y_train, y_valid = y[train_index], y[valid_index]

        network.fit(X_train, y_train, X_valid, y_valid,
                    callbacks=[callback])
        weights.append(network.get_embedded_weights())

    # zip(*weights) groups the folds per embedding; the inner zip groups the
    # folds per row, so each row is averaged across folds.
    new_weights = [
        [np.array(rows).mean(axis=0) for rows in zip(*per_embedding)]
        for per_embedding in zip(*weights)
    ]

    network.save_weights(new_weights)
    network.save_labels(labels)

    if ENABLE_EMB_VIZ:
        network.make_visualizations_from_network(extension='png')

    df_to_enc = df_to_enc.drop(target_name, axis=1)
    mapping_dir = Path.joinpath(cfg.PROCESSED_DATA_DIR, 'embeddings')
    # The target is already dropped, so every remaining column is encoded;
    # the former ``shape[1] - 1`` bound left the last column untouched.
    for index, column in enumerate(list(df_to_enc.columns)):
        labels_column = labels[index]
        embeddings_column = new_weights[index]
        pca = PCA(n_components=1)
        projected = pca.fit_transform(embeddings_column)
        # Flatten the (n, 1) projection into one value per class.
        y_array = np.concatenate(projected)
        mapping = dict(zip(labels_column.classes_, y_array))
        file_writer.write_mapping(mapping,
                                  mapping_dir,
                                  f'{target_name.lower()}_{column}.csv')
        df_to_enc[column] = df_to_enc[column].replace(to_replace=mapping)

    return df_to_enc
Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
204

thecml's avatar
thecml committed
205
def encode_dataframe(df_to_enc, target_name, artifacts_path):
    """Encode feature columns with embeddings from a single train/val fit.

    The embedder is fitted once on the split returned by
    ``preprocessor.prepare_data_for_embedder``; the model, embedding weights
    and labels are saved, and every feature column is replaced by the first
    principal component of its embedding. Each value-to-number mapping is
    also written to the ``embeddings`` directory under
    ``cfg.PROCESSED_DATA_DIR``.

    Args:
        df_to_enc: Data frame holding the columns to encode plus the target.
        target_name: Name of the target column inside ``df_to_enc``.
        artifacts_path: Forwarded to the embedder through ``get_config``.

    Returns:
        A copy of ``df_to_enc`` without the target column in which each
        feature column has been passed through its mapping.
    """
    params = get_config(df_to_enc, target_name, artifacts_path)

    X_train, X_val, y_train, y_val, labels = \
        preprocessor.prepare_data_for_embedder(df_to_enc,
                                               target_name,
                                               params['train_ratio'])

    network = neural_embedder.NeuralEmbedder(**params)
    network.fit(X_train, y_train, X_val, y_val)
    network.save_model()
    embedded_weights = network.get_embedded_weights()
    network.save_weights(embedded_weights)
    network.save_labels(labels)

    if ENABLE_EMB_VIZ:
        network.make_visualizations_from_network(extension='png')

    df_to_enc = df_to_enc.drop(target_name, axis=1)
    mapping_dir = Path.joinpath(cfg.PROCESSED_DATA_DIR, 'embeddings')
    # The target is already dropped, so every remaining column is encoded;
    # the former ``shape[1] - 1`` bound left the last column untouched.
    for index, column in enumerate(list(df_to_enc.columns)):
        labels_column = labels[index]
        embeddings_column = embedded_weights[index]
        pca = PCA(n_components=1)
        projected = pca.fit_transform(embeddings_column)
        # Flatten the (n, 1) projection into one value per class.
        y_array = np.concatenate(projected)
        mapping = dict(zip(labels_column.classes_, y_array))
        file_writer.write_mapping(mapping,
                                  mapping_dir,
                                  f'{target_name.lower()}_{column}.csv')
        df_to_enc[column] = df_to_enc[column].replace(to_replace=mapping)

    return df_to_enc
237
238
239

# Build the datasets only when executed as a script, not when imported.
if __name__ == "__main__":
    main()