cleaner.py 7.21 KB
Newer Older
1
2
3
4
5
import abc
import os
from typing import List, Optional, Union

import numpy as np
import pandas as pd

import config as cfg
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41

class BaseCleaner(metaclass=abc.ABCMeta):
    """Abstract interface for the data set cleaners.

    A concrete cleaner must implement every method below. Each method
    receives the raw data set as a DataFrame and returns the cleaned
    DataFrame. Methods taking ``ptd`` also receive the patient data set.
    """

    @abc.abstractmethod
    def clean_clusters(self, cl: pd.DataFrame) -> pd.DataFrame:
        """Cleans the cluster data set"""

    @abc.abstractmethod
    def clean_patient_data(self, ptd: pd.DataFrame) -> pd.DataFrame:
        """Cleans the patient data set"""

    @abc.abstractmethod
    def clean_screening_content(self, sc: pd.DataFrame,
                                ptd: pd.DataFrame) -> pd.DataFrame:
        """Cleans the screening content data set"""

    @abc.abstractmethod
    def clean_status_set(self, ss: pd.DataFrame,
                         ptd: pd.DataFrame) -> pd.DataFrame:
        """Cleans the status set data set"""

    @abc.abstractmethod
    def clean_training_done(self, td: pd.DataFrame,
                            ptd: pd.DataFrame) -> pd.DataFrame:
        """Cleans the training done data set"""

    @abc.abstractmethod
    def clean_training_cancelled(self, tc: pd.DataFrame,
                                 ptd: pd.DataFrame) -> pd.DataFrame:
        """Cleans the training cancelled data set"""

    @abc.abstractmethod
    def clean_assistive_aids(self, ats: pd.DataFrame, ic: pd.DataFrame,
                             ids: Optional[List[str]] = None) -> pd.DataFrame:
        """Cleans the assistive aids data set

        :param ats: the assistive aids data set
        :param ic: the ISO classes used to validate devices
        :param ids: optional list of citizen ids; defaults to None so the
            signature matches the concrete cleaners
        """

    @abc.abstractmethod
    def clean_fall_data(self, fd: pd.DataFrame) -> pd.DataFrame:
        """Cleans the fall set"""

class Cleaner2020(BaseCleaner):
    """Cleaner whose pipelines match and join records on cfg.CITIZEN_ID."""

    def clean_clusters(self, cl: pd.DataFrame) -> pd.DataFrame:
        """Returns the cluster data set unchanged."""
        return cl

    def clean_patient_data(self, ptd: pd.DataFrame) -> pd.DataFrame:
        """Drops patient rows that carry a placeholder citizen id."""
        return remove_citizens_without_valid_id(ptd)

    def clean_screening_content(self, sc: pd.DataFrame, ptd: pd.DataFrame) -> pd.DataFrame:
        """Keeps screenings of known citizens, joins patient data, sorts by
        citizen id and screening date."""
        known = remove_citizens_not_in_patient_data(sc, ptd, cfg.CITIZEN_ID)
        joined = merge_train_and_patient_data(known, ptd, cfg.CITIZEN_ID)
        return sort_dataframe(joined, [cfg.CITIZEN_ID, cfg.SCREENING_DATE])

    def clean_status_set(self, ss: pd.DataFrame, ptd: pd.DataFrame) -> pd.DataFrame:
        """Keeps status rows of known citizens, joins patient data, sorts by
        citizen id and change date."""
        known = remove_citizens_not_in_patient_data(ss, ptd, cfg.CITIZEN_ID)
        joined = merge_train_and_patient_data(known, ptd, cfg.CITIZEN_ID)
        return sort_dataframe(joined, [cfg.CITIZEN_ID, cfg.CHANGE_DATE])

    def clean_training_done(self, td: pd.DataFrame, ptd: pd.DataFrame) -> pd.DataFrame:
        """Keeps completed trainings of known citizens and sorts by citizen id
        and rating date.

        Unlike the sibling methods, no patient data is merged in here.
        """
        known = remove_citizens_not_in_patient_data(td, ptd, cfg.CITIZEN_ID)
        return sort_dataframe(known, [cfg.CITIZEN_ID, cfg.RATING_DATE])

    def clean_training_cancelled(self, tc: pd.DataFrame, ptd: pd.DataFrame) -> pd.DataFrame:
        """Keeps cancelled trainings of known citizens, joins patient data,
        sorts by citizen id and rating date."""
        known = remove_citizens_not_in_patient_data(tc, ptd, cfg.CITIZEN_ID)
        joined = merge_train_and_patient_data(known, ptd, cfg.CITIZEN_ID)
        return sort_dataframe(joined, [cfg.CITIZEN_ID, cfg.RATING_DATE])

    def clean_assistive_aids(self, ats: pd.DataFrame, ic: pd.DataFrame,
                             ids: List[str] = None) -> pd.DataFrame:
        """Runs the assistive aids cleaning pipeline.

        ``ids`` is accepted to satisfy the shared interface but is not used
        by this cleaner.
        """
        ordered = sort_dataframe(ats, [cfg.CITIZEN_ID, cfg.LEND_DATE])
        with_valid_id = remove_citizens_without_valid_id(ordered)
        in_date_range = remove_rows_with_old_dates(with_valid_id, cfg.LEND_DATE)
        # NOTE(review): remove_deprecated_device_data already drops every row
        # that remove_tainted_histories searches for (both test DEV_HMI_NUMBER
        # against '899,999'), so the tainted-history step cannot find anything
        # in this order -- confirm which order is intended.
        without_deprecated = remove_deprecated_device_data(in_date_range)
        without_tainted = remove_tainted_histories(without_deprecated)
        return drop_invalid_devices(without_tainted, ic)

    def clean_fall_data(self, fd: pd.DataFrame) -> pd.DataFrame:
        """Drops fall rows with a placeholder citizen id and sorts by citizen
        id and date."""
        with_valid_id = remove_citizens_without_valid_id(fd)
        return sort_dataframe(with_valid_id, [cfg.CITIZEN_ID, cfg.DATE])
                
class Cleaner2019(BaseCleaner):
    """Cleaner whose pipelines match and join records on cfg.PATIENT_ID and
    sort the result on cfg.CITIZEN_ID."""

    def clean_clusters(self, cl: pd.DataFrame) -> pd.DataFrame:
        """Returns the cluster data set unchanged."""
        return cl

    def clean_patient_data(self, ptd: pd.DataFrame) -> pd.DataFrame:
        """Drops patient rows that carry a placeholder citizen id."""
        return remove_citizens_without_valid_id(ptd)

    def clean_screening_content(self, sc: pd.DataFrame, ptd: pd.DataFrame) -> pd.DataFrame:
        """Keeps screenings of known patients that have exercise content,
        joins patient data, sorts by citizen id and screening date."""
        known = remove_citizens_not_in_patient_data(sc, ptd, cfg.PATIENT_ID)
        with_exercises = remove_screenings_without_exercises(known)
        joined = merge_train_and_patient_data(with_exercises, ptd, cfg.PATIENT_ID)
        return sort_dataframe(joined, [cfg.CITIZEN_ID, cfg.SCREENING_DATE])

    def clean_status_set(self, ss: pd.DataFrame, ptd: pd.DataFrame) -> pd.DataFrame:
        """Keeps status rows of known patients, joins patient data, sorts by
        citizen id and change date."""
        known = remove_citizens_not_in_patient_data(ss, ptd, cfg.PATIENT_ID)
        joined = merge_train_and_patient_data(known, ptd, cfg.PATIENT_ID)
        return sort_dataframe(joined, [cfg.CITIZEN_ID, cfg.CHANGE_DATE])

    def clean_training_done(self, td: pd.DataFrame, ptd: pd.DataFrame) -> pd.DataFrame:
        """Keeps completed trainings of known patients, joins patient data,
        sorts by citizen id and rating date."""
        known = remove_citizens_not_in_patient_data(td, ptd, cfg.PATIENT_ID)
        joined = merge_train_and_patient_data(known, ptd, cfg.PATIENT_ID)
        return sort_dataframe(joined, [cfg.CITIZEN_ID, cfg.RATING_DATE])

    def clean_training_cancelled(self, tc: pd.DataFrame, ptd: pd.DataFrame) -> pd.DataFrame:
        """Keeps cancelled trainings of known patients, joins patient data,
        sorts by citizen id and rating date."""
        known = remove_citizens_not_in_patient_data(tc, ptd, cfg.PATIENT_ID)
        joined = merge_train_and_patient_data(known, ptd, cfg.PATIENT_ID)
        return sort_dataframe(joined, [cfg.CITIZEN_ID, cfg.RATING_DATE])

    def clean_assistive_aids(self, ats: pd.DataFrame, ic: pd.DataFrame,
                             ids: List[str] = None) -> pd.DataFrame:
        """Runs the assistive aids cleaning pipeline.

        ``ids`` is handed to filter_ats_on_ids to select the citizens whose
        rows are kept.
        """
        ordered = sort_dataframe(ats, [cfg.CITIZEN_ID, cfg.LEND_DATE])
        selected = filter_ats_on_ids(ordered, ids)
        valid_lend = remove_rows_with_old_dates(selected, cfg.LEND_DATE)
        valid_return = remove_rows_with_old_dates(valid_lend, cfg.RETURN_DATE)
        # NOTE(review): remove_deprecated_device_data already drops every row
        # that remove_tainted_histories searches for (both test DEV_HMI_NUMBER
        # against '899,999'), so the tainted-history step cannot find anything
        # in this order -- confirm which order is intended.
        without_deprecated = remove_deprecated_device_data(valid_return)
        without_tainted = remove_tainted_histories(without_deprecated)
        return drop_invalid_devices(without_tainted, ic)

    def clean_fall_data(self, fd: pd.DataFrame) -> pd.DataFrame:
        """Not supported by this cleaner; always raises NotImplementedError."""
        raise NotImplementedError

Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
134
def drop_invalid_devices(ats: pd.DataFrame, iso_classes: pd.DataFrame) -> pd.DataFrame:
    """Keeps only the rows whose device ISO class is listed in iso_classes.

    :param ats: the assistive aids data set
    :param iso_classes: frame whose DevISOClass column holds the accepted classes
    :return: the rows of ats with an accepted cfg.DEV_ISO_CLASS value
    """
    accepted_classes = iso_classes.DevISOClass
    has_accepted_class = ats[cfg.DEV_ISO_CLASS].isin(accepted_classes)
    return ats[has_accepted_class]

Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
137
def remove_screenings_without_exercises(df: pd.DataFrame) -> pd.DataFrame:
    """Drops the rows whose cfg.EXERCISE_CONTENT equals the string 'nan'.

    NOTE(review): the comparison is against the literal text 'nan', so this
    presumably relies on the column having been cast to str upstream -- a
    real NaN value would not be removed here. Confirm against the loader.

    :param df: the screening data set
    :return: the rows of df that have exercise content
    """
    has_exercises = df[cfg.EXERCISE_CONTENT] != 'nan'
    return df[has_exercises]

Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
141
142
143
def remove_citizens_not_in_patient_data(train_data: pd.DataFrame,
                                        patient_data: pd.DataFrame,
                                        id: str) -> pd.DataFrame:
    """Keeps the rows of train_data whose id also occurs in patient_data.

    :param train_data: the data set to filter
    :param patient_data: the data set providing the known ids
    :param id: name of the id column, present in both frames
    :return: the rows of train_data with a known id
    """
    known_ids = patient_data[id].unique()
    is_known = train_data[id].isin(known_ids)
    return train_data[is_known]

Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
147
def remove_citizens_without_valid_id(df: pd.DataFrame) -> pd.DataFrame:
    """Drops the rows whose cfg.CITIZEN_ID is one of the known placeholders.

    :param df: a data set with a cfg.CITIZEN_ID column
    :return: the rows of df whose citizen id is not a placeholder value
    """
    # Values treated as "no usable id"; they are filtered out one at a time.
    placeholder_ids = ("0000000000", '0', "#VALUE!", 'nan', '681')
    kept = df
    for placeholder in placeholder_ids:
        kept = kept[kept[cfg.CITIZEN_ID] != placeholder]
    return kept

Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
155
156
157
def merge_train_and_patient_data(train_data: pd.DataFrame,
                                 patient_data: pd.DataFrame,
                                 key: str) -> pd.DataFrame:
    """Joins train_data with patient_data on a shared column.

    Uses the default join of pd.merge (inner), so rows without a match in
    the other frame are not part of the result.

    :param train_data: the left frame
    :param patient_data: the right frame
    :param key: name of the column to join on, present in both frames
    :return: the merged frame
    """
    merged = pd.merge(left=train_data, right=patient_data, on=key)
    return merged

Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
160
def sort_dataframe(data: pd.DataFrame, by: Union[str, List[str]]) -> pd.DataFrame:
    """Returns a copy of data sorted in ascending order.

    :param data: the frame to sort; it is not modified
    :param by: a column name, or a list of column names in order of
        precedence (the cleaners in this module always pass a list)
    :return: the sorted frame
    """
    return data.sort_values(by)

Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
163
def filter_ats_on_ids(ats: pd.DataFrame, ids: Optional[List[str]]) -> pd.DataFrame:
    """Keeps the rows of ats whose cfg.CITIZEN_ID is contained in ids.

    :param ats: the assistive aids data set
    :param ids: the citizen ids to keep, or None to keep every row
    :return: the filtered frame; ats itself when ids is None
    """
    # Callers default ids to None, and Series.isin rejects a non-list-like
    # argument with a TypeError, so None is treated as "no filtering".
    if ids is None:
        return ats
    return ats[ats[cfg.CITIZEN_ID].isin(ids)]

Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
166
def remove_tainted_histories(ats: pd.DataFrame) -> pd.DataFrame:
    """Drops every row of each citizen that has at least one row with the
    cfg.DEV_HMI_NUMBER value '899,999'.

    :param ats: the assistive aids data set
    :return: the rows of ats belonging to citizens without such a row
    """
    is_tainted_row = ats[cfg.DEV_HMI_NUMBER] == '899,999'
    tainted_citizens = ats.loc[is_tainted_row, cfg.CITIZEN_ID].unique()
    belongs_to_tainted = ats[cfg.CITIZEN_ID].isin(tainted_citizens)
    return ats[~belongs_to_tainted]

Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
171
def remove_deprecated_device_data(ats: pd.DataFrame) -> pd.DataFrame:
    """Drops the rows whose cfg.DEV_HMI_NUMBER equals '899,999'.

    :param ats: the assistive aids data set
    :return: the rows of ats with any other HMI number
    """
    is_current_device = ats[cfg.DEV_HMI_NUMBER] != '899,999'
    return ats[is_current_device]

Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
174
def remove_rows_with_old_dates(ats: pd.DataFrame, col: str) -> pd.DataFrame:
    """Keeps the rows whose date in col lies between 1900-01-01 and now.

    The column is parsed with pd.to_datetime, and the returned frame holds
    the parsed values. Missing dates (NaT) fail both comparisons and are
    dropped as well. The input frame is left untouched.

    :param ats: the data set to filter
    :param col: name of the date column
    :return: a new frame with the in-range rows and col converted to datetime
    """
    # Parse into a separate Series rather than assigning back into `ats`:
    # that would modify the caller's frame and raises SettingWithCopyWarning
    # when `ats` is itself the result of an earlier filter.
    parsed = pd.to_datetime(ats[col])
    in_range = (parsed >= '1900-01-01') & (parsed <= pd.Timestamp('today'))
    kept = ats.loc[in_range].copy()
    kept[col] = parsed.loc[in_range]
    return kept