make_dataset_ordinal.py 3.22 KB
Newer Older
thecml's avatar
thecml committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/env python
import paths as pt
from tools import file_reader, file_writer
from tools import preprocessor
from utility import embedder
import pandas as pd
import numpy as np
from pathlib import Path
from utility.settings import load_settings
from io import StringIO
import shutil
from sklearn.preprocessing import OrdinalEncoder

def _read_processed_csv(label_name, converters):
    """Load ``<label_name>.csv`` (lower-cased) from ``pt.PROCESSED_DATA_DIR``.

    The file is first copied into an in-memory text buffer, and that buffer
    is handed to ``file_reader.read_csv``.

    Args:
        label_name: Target label; its lower-cased form is the file stem.
        converters: Mapping of column name -> converter callable, forwarded
            unchanged to ``file_reader.read_csv``.

    Returns:
        Whatever ``file_reader.read_csv`` returns (used by the caller as a
        pandas DataFrame -- NOTE(review): confirm against tools.file_reader).
    """
    csv_path = Path.joinpath(pt.PROCESSED_DATA_DIR, f'{label_name.lower()}.csv')
    infile = StringIO()
    with open(csv_path, 'r') as fd:
        shutil.copyfileobj(fd, infile)
    infile.seek(0)
    return file_reader.read_csv(infile, converters=converters)


def _ordinal_encode(df, encoded_cols, label_name):
    """Return a new frame with ``encoded_cols`` replaced by ordinal codes.

    The result is laid out as: every column that is neither encoded nor the
    label (in their original relative order), then the encoded columns in
    the order given, then the label column last.

    Args:
        df: Input DataFrame containing ``encoded_cols`` and ``label_name``.
        encoded_cols: Names of the columns to ordinal-encode.
        label_name: Name of the target column.

    Returns:
        A DataFrame built from the column-stacked values.
    """
    # The columns are cast to str once, so the encoder sees string categories.
    as_text = df[encoded_cols].astype(str)
    encoded = OrdinalEncoder().fit_transform(as_text)

    passthrough = df.drop(encoded_cols + [label_name], axis=1)
    stacked = np.column_stack((passthrough.values,
                               encoded,
                               df[[label_name]].values))

    # Name the columns after the parts that were actually stacked. Reusing
    # df.columns would put headers on the wrong data whenever the input file
    # is not already ordered as [pass-through, encoded, label].
    column_names = passthrough.columns.tolist() + encoded_cols + [label_name]
    return pd.DataFrame(stacked, columns=column_names)


def main():
    """Write an ordinal-encoded copy of each processed label dataset.

    For each of "Complete", "Compliance", "Fall" and "Risk", reads
    ``<label>.csv`` from ``pt.PROCESSED_DATA_DIR``, ordinal-encodes the
    ``<i>Ats`` columns (and, for "Risk", also the ``<i>Ex`` columns), and
    writes the result to ``<label>_ordinal.csv`` in the same directory.
    """
    # data.yaml does not depend on the label, so load it once up front.
    data_settings = load_settings(pt.CONFIGS_DIR, 'data.yaml')
    ats_resolution = data_settings['ats_resolution']
    ats_cols = [f'{i}Ats' for i in range(1, ats_resolution + 1)]

    for label_name in ["Complete", "Compliance", "Fall", "Risk"]:
        if label_name == "Risk":
            # Only the Risk dataset carries the additional Ex columns.
            target_settings = load_settings(pt.CONFIGS_DIR, f'{label_name.lower()}.yaml')
            ex_resolution = target_settings['ex_resolution']
            ex_cols = [f'{i}Ex' for i in range(1, ex_resolution + 1)]
            encoded_cols = ex_cols + ats_cols
        else:
            encoded_cols = list(ats_cols)

        # Read the to-be-encoded columns as raw strings.
        converters = {col: str for col in encoded_cols}
        df = _read_processed_csv(label_name, converters)
        df = _ordinal_encode(df, encoded_cols, label_name)

        # Serialize into memory first, so a failure in write_csv cannot
        # leave a truncated or empty output file behind.
        outfile = StringIO()
        file_writer.write_csv(df, outfile)
        outfile.seek(0)

        out_path = Path.joinpath(pt.PROCESSED_DATA_DIR,
                                 f'{label_name.lower()}_ordinal.csv')
        with open(out_path, 'w', newline='') as fd:
            shutil.copyfileobj(outfile, fd)

# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()