#!/usr/bin/env python
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf
import yaml
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold

import paths as pt
from tools import file_reader, file_writer, neural_embedder, preprocessor

USE_CROSS_VALID = False
USE_GROUPING = False
ENABLE_EMB_VIZ = False

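# Each label has a config, <label>_emb.yaml, under pt.CONFIGS_DIR. The keys read in
# this script are sketched below for orientation; the values shown are placeholders,
# not the project's actual settings:
#
#   ats_resolution: 10        # number of <i>Ats columns to embed
#   ex_resolution: 9          # Fall config only: number of <i>Ex columns to embed
#   target_name: Complete
#   train_ratio: 0.8
#   batch_size: 32
#   num_epochs: 10            # Fall uses num_epochs_ats / num_epochs_ex instead
#   optimizer: Adam
#   network_layers: [128]
#   verbose: True
#   model_path: models/complete_emb
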
def main(ats_resolution: int = None):
    """Embed the categorical ATS/Ex columns per label and write <label>_emb.csv files."""
    for label_name in ["Complete", "Compliance", "Alarm", "Fall"]:
        with open(Path.joinpath(pt.CONFIGS_DIR,
                                f'{label_name.lower()}_emb.yaml'), 'r') as stream:
            settings = yaml.safe_load(stream)

        if ats_resolution is None:
            ats_resolution = settings['ats_resolution']
        if label_name == "Fall":
            ex_resolution = settings['ex_resolution']
        
        # Read the processed dataset, forcing the categorical ATS/Ex codes to load as strings
        if label_name in ["Complete", "Compliance", "Alarm"]:
            ats = {str(i)+'Ats': str for i in range(1, ats_resolution+1)}
            df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                                      f'{label_name.lower()}.csv',
                                      converters=ats)
        else:
            ex = {str(i)+'Ex': str for i in range(1, ex_resolution+1)}
            ats = {str(i)+'Ats': str for i in range(1, ats_resolution+1)}
            converters = {**ex, **ats}
            df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                                      f'{label_name.lower()}.csv',
                                      converters=converters)
        
        # Split the categorical columns to embed from the remaining features
        if label_name in ["Complete", "Compliance", "Alarm"]:
            emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
            n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
            df_to_enc = df.iloc[:, n_numerical_cols:]
            ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
            df = df.drop(ats_cols, axis=1)
        else:
            ats_cols = [str(i)+'Ats' for i in range(1, ats_resolution+1)]
            ex_cols = [str(i)+'Ex' for i in range(1, ex_resolution+1)]
            df_ats_to_enc = df.filter(regex=r'Fall|((\d+)[Ats])\w+', axis=1)
            df_ex_to_enc = df.filter(regex=r'Fall|((\d+)[Ex])\w+', axis=1)
            df = df.drop(ats_cols + ex_cols, axis=1)
        
        # Load embedded config
        with open(Path.joinpath(pt.CONFIGS_DIR,
                                f"{label_name.lower()}_emb.yaml"), 'r') as stream:
            emb_cfg = yaml.safe_load(stream)

        # Encode dataframe given params
        model_path = Path.joinpath(pt.ROOT_DIR, emb_cfg['model_path'])
        if label_name in ["Complete", "Compliance", "Alarm"]:
            df_enc = encode_dataframe(df=df_to_enc,
                                      target_name=emb_cfg['target_name'],
                                      batch_size=emb_cfg['batch_size'],
                                      train_ratio=emb_cfg['train_ratio'],
                                      epochs=emb_cfg['num_epochs'],
                                      optimizer=emb_cfg['optimizer'],
                                      network_layers=emb_cfg['network_layers'],
                                      verbose=emb_cfg['verbose'],
                                      model_path=model_path)
        else:
            ats_enc = encode_dataframe(df=df_ats_to_enc,
                                       target_name=emb_cfg['target_name'],
                                       batch_size=emb_cfg['batch_size'],
                                       train_ratio=emb_cfg['train_ratio'],
                                       epochs=emb_cfg['num_epochs_ats'],
                                       optimizer=emb_cfg['optimizer'],
                                       network_layers=emb_cfg['network_layers'],
                                       verbose=emb_cfg['verbose'],
                                       model_path=model_path)
            ex_enc = encode_dataframe(df=df_ex_to_enc,
                                      target_name=emb_cfg['target_name'],
                                      batch_size=emb_cfg['batch_size'],
                                      train_ratio=emb_cfg['train_ratio'],
                                      epochs=emb_cfg['num_epochs_ex'],
                                      optimizer=emb_cfg['optimizer'],
                                      network_layers=emb_cfg['network_layers'],
                                      verbose=emb_cfg['verbose'],
                                      model_path=model_path)
    
        df_rand = pd.DataFrame(np.random.rand(len(df), 1), columns=['Rand'])  # add random var

        # Concatenate the remaining features, the random column and the encoded
        # columns, and move the label to the last position
        if label_name in ["Complete", "Compliance", "Alarm"]:
            df = pd.concat([df.drop(label_name, axis=1), df_rand, df_enc,
                            df.pop(label_name)], axis=1)
        else:
            df = pd.concat([df.drop(label_name, axis=1), df_rand, ats_enc, ex_enc,
                            df.pop(label_name)], axis=1)

        file_writer.write_csv(df, pt.PROCESSED_DATA_DIR, f'{label_name.lower()}_emb.csv')
        
def encode_dataframe(df, target_name, batch_size, train_ratio, epochs,
                     optimizer, network_layers, verbose, model_path):
    """Train a neural embedder on the categorical columns of df, persist the model,
    weights and labels, and return the columns with each category mapped to a scalar.
    """
    X_train, X_val, y_train, y_val, labels = \
        preprocessor.prepare_data_for_embedder(df, target_name, train_ratio)
    network = neural_embedder.NeuralEmbedder(df=df, target_name=target_name, epochs=epochs,
                                             batch_size=batch_size, network_layers=network_layers,
                                             optimizer_fn=optimizer, verbose=verbose,
                                             model_path=model_path)
    network.fit(X_train, y_train, X_val, y_val)
    network.save_model()
    embedded_weights = network.get_embedded_weights()
    network.save_weights(embedded_weights)
    network.save_labels(labels)

    if ENABLE_EMB_VIZ:
        network.make_visualizations_from_network(extension='png')

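    # The embedder learns one embedding matrix per categorical column
    # (n_categories x embedding_dim). Each matrix is reduced below to a single
    # scalar per category with 1-component PCA, and the original string categories
    # are replaced by that scalar so downstream models get numerical features.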
    df_to_enc = df.drop(target_name, axis=1)
    for index in range(df_to_enc.shape[1]):
        column = df_to_enc.columns[index]
        labels_column = labels[index]
        embeddings_column = embedded_weights[index]
        pca = PCA(n_components=1)
        Y = pca.fit_transform(embeddings_column)
        y_array = np.concatenate(Y)
        mapping = dict(zip(labels_column.classes_, y_array))
        file_writer.write_mapping(mapping,
                                  Path.joinpath(pt.PROCESSED_DATA_DIR, 'embeddings'),
                                  f'{target_name.lower()}_{column}.csv')
        df_to_enc[column] = df_to_enc[column].replace(to_replace=mapping)

    return df_to_enc

def encode_dataframe_cv(df, target_name, batch_size, train_ratio,
                        epochs, network_layers, verbose, model_path):
    """Cross-validated variant of encode_dataframe: fit the embedder on stratified
    folds and average the learned embedding weights across folds (train_ratio is
    not used in this variant).
    """
    X, y = preprocessor.get_X_y(df, target_name)
    X, labels = preprocessor.encode_vector_label(X)
    y = np.array(y)

    network = neural_embedder.NeuralEmbedder(df=df, target_name=target_name, epochs=epochs,
                                             batch_size=batch_size, network_layers=network_layers,
                                             verbose=verbose, model_path=model_path)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                   mode='min',
                                                   patience=3,
                                                   verbose=0)

    weights = []
    for train_index, valid_index in skf.split(X, y):
        X_train, X_valid = X[train_index, :], X[valid_index, :]
        y_train, y_valid = y[train_index], y[valid_index]

        _ = network.fit(X_train, y_train, X_valid, y_valid,
                        callbacks=[es_callback])
        embedded_weights = network.get_embedded_weights()
        weights.append(embedded_weights)
        
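    # Average the embedding matrices element-wise across the folds so each
    # categorical column ends up with a single set of embedding weights.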
    new_weights = list()
    for weights_list_tuple in zip(*weights):
        new_weights.append(
            [np.array(weights_).mean(axis=0)
             for weights_ in zip(*weights_list_tuple)])

    network.save_weights(new_weights)
    network.save_labels(labels)

    if ENABLE_EMB_VIZ:
        network.make_visualizations_from_network(extension='png')

    # As in encode_dataframe: reduce each fold-averaged embedding to one scalar
    # per category with PCA and map the categories onto those scalars.
    df_to_enc = df.drop(target_name, axis=1)
    for index in range(df_to_enc.shape[1]):
        column = df_to_enc.columns[index]
        labels_column = labels[index]
        embeddings_column = new_weights[index]
        pca = PCA(n_components=1)
        Y = pca.fit_transform(embeddings_column)
        y_array = np.concatenate(Y)
        mapping = dict(zip(labels_column.classes_, y_array))
        file_writer.write_mapping(mapping,
                                  Path.joinpath(pt.PROCESSED_DATA_DIR, 'embeddings'),
                                  f'{target_name.lower()}_{column}.csv')
        df_to_enc[column] = df_to_enc[column].replace(to_replace=mapping)

    return df_to_enc

if __name__ == "__main__":
    main()
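
# Usage sketch (assumes the processed CSVs and the per-label <label>_emb.yaml configs
# exist at the locations defined in paths.py; the resolution value is illustrative):
#
#   python make_dataset_emb.py
#
# or from another module:
#
#   from make_dataset_emb import main
#   main(ats_resolution=10)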