# cleaner.py
# !/usr/bin/env python

import config as cfg
import pandas as pd
import numpy as np
import os
import abc

class BaseCleaner(metaclass=abc.ABCMeta):
    """Abstract interface that every data-set cleaner has to implement.

    Every method is abstract; a subclass must override all of them before
    it can be instantiated.
    """

    @abc.abstractmethod
    def clean_clusters(self, cl):
        """Cleans the cluster data set"""

    @abc.abstractmethod
    def clean_patient_data(self, ptd):
        """Cleans the patient data set"""

    @abc.abstractmethod
    def clean_screening_content(self, sc, ptd):
        """Cleans the screening content data set"""

    @abc.abstractmethod
    def clean_status_set(self, ss, ptd):
        """Cleans the status set data set"""

    @abc.abstractmethod
    def clean_training_done(self, td, ptd):
        """Cleans the training done data set"""

    @abc.abstractmethod
    def clean_training_cancelled(self, tc, ptd):
        """Cleans the training cancelled data set"""

    # ``ids`` is optional so that the abstract signature matches the
    # concrete cleaners, which all declare ``ids=None``.
    @abc.abstractmethod
    def clean_assistive_aids(self, ats, ic, ids=None):
        """Cleans the assistive aids data set"""

    @abc.abstractmethod
    def clean_fall_data(self, fd):
        """Cleans the fall set"""

class Cleaner2020(BaseCleaner):
    """Cleaner for the 2020 data sets; every step is keyed on ``cfg.CITIZEN_ID``."""

    def clean_clusters(self, cl):
        """Return the cluster data set unchanged."""
        return cl

    def clean_patient_data(self, ptd):
        """Drop patient rows whose citizen id is a known placeholder value."""
        return remove_citizens_without_valid_id(ptd)

    def clean_screening_content(self, sc, ptd):
        """Keep screenings of known citizens, join patient data, sort by screening date."""
        known = remove_citizens_not_in_patient_data(sc, ptd, cfg.CITIZEN_ID)
        joined = merge_train_and_patient_data(known, ptd, cfg.CITIZEN_ID)
        return sort_dataframe(joined, [cfg.CITIZEN_ID, cfg.SCREENING_DATE])

    def clean_status_set(self, ss, ptd):
        """Keep status rows of known citizens, join patient data, sort by change date."""
        known = remove_citizens_not_in_patient_data(ss, ptd, cfg.CITIZEN_ID)
        joined = merge_train_and_patient_data(known, ptd, cfg.CITIZEN_ID)
        return sort_dataframe(joined, [cfg.CITIZEN_ID, cfg.CHANGE_DATE])

    def clean_training_done(self, td, ptd):
        """Keep completed trainings of known citizens and sort by rating date.

        Unlike the other training data sets, this one is not merged with the
        patient data.
        """
        known = remove_citizens_not_in_patient_data(td, ptd, cfg.CITIZEN_ID)
        return sort_dataframe(known, [cfg.CITIZEN_ID, cfg.RATING_DATE])

    def clean_training_cancelled(self, tc, ptd):
        """Keep cancelled trainings of known citizens, join patient data, sort by rating date."""
        known = remove_citizens_not_in_patient_data(tc, ptd, cfg.CITIZEN_ID)
        joined = merge_train_and_patient_data(known, ptd, cfg.CITIZEN_ID)
        return sort_dataframe(joined, [cfg.CITIZEN_ID, cfg.RATING_DATE])

    def clean_assistive_aids(self, ats, ic, ids=None):
        """Run the assistive-aids cleaning pipeline.

        ``ic`` supplies the accepted ISO classes; ``ids`` is accepted for
        interface compatibility but is not used here.
        """
        ordered = sort_dataframe(ats, [cfg.CITIZEN_ID, cfg.LEND_DATE])
        with_valid_ids = remove_citizens_without_valid_id(ordered)
        plausible_dates = remove_rows_with_old_dates(with_valid_ids, cfg.LEND_DATE)
        # NOTE(review): the deprecated-device rows are removed first, so the
        # tainted-history step that follows no longer finds any marker rows
        # and leaves the data as is. Confirm whether the order is intended.
        without_deprecated = remove_deprecated_device_data(plausible_dates)
        untainted = remove_tainted_histories(without_deprecated)
        return drop_invalid_devices(untainted, ic)

    def clean_fall_data(self, fd):
        """Drop fall rows with placeholder citizen ids and sort by date."""
        with_valid_ids = remove_citizens_without_valid_id(fd)
        return sort_dataframe(with_valid_ids, [cfg.CITIZEN_ID, cfg.DATE])
                
class Cleaner2019(BaseCleaner):
    """Cleaner for the 2019 data sets.

    Training-related data sets are filtered and merged on ``cfg.PATIENT_ID``
    and afterwards sorted on ``cfg.CITIZEN_ID``.
    """

    def clean_clusters(self, cl):
        """Return the cluster data set unchanged."""
        return cl

    def clean_patient_data(self, ptd):
        """Drop patient rows whose citizen id is a known placeholder value."""
        ptd = remove_citizens_without_valid_id(ptd)
        return ptd

    def clean_screening_content(self, sc, ptd):
        """Keep screenings of known patients that have exercises, join patient data, sort."""
        sc = remove_citizens_not_in_patient_data(sc, ptd, cfg.PATIENT_ID)
        sc = remove_screenings_without_exercises(sc)
        sc = merge_train_and_patient_data(sc, ptd, cfg.PATIENT_ID)
        sc = sort_dataframe(sc, [cfg.CITIZEN_ID, cfg.SCREENING_DATE])
        return sc

    def clean_status_set(self, ss, ptd):
        """Keep status rows of known patients, join patient data, sort by change date."""
        ss = remove_citizens_not_in_patient_data(ss, ptd, cfg.PATIENT_ID)
        ss = merge_train_and_patient_data(ss, ptd, cfg.PATIENT_ID)
        ss = sort_dataframe(ss, [cfg.CITIZEN_ID, cfg.CHANGE_DATE])
        return ss

    def clean_training_done(self, td, ptd):
        """Keep completed trainings of known patients, join patient data, sort by rating date."""
        td = remove_citizens_not_in_patient_data(td, ptd, cfg.PATIENT_ID)
        td = merge_train_and_patient_data(td, ptd, cfg.PATIENT_ID)
        td = sort_dataframe(td, [cfg.CITIZEN_ID, cfg.RATING_DATE])
        return td

    def clean_training_cancelled(self, tc, ptd):
        """Keep cancelled trainings of known patients, join patient data, sort by rating date."""
        tc = remove_citizens_not_in_patient_data(tc, ptd, cfg.PATIENT_ID)
        tc = merge_train_and_patient_data(tc, ptd, cfg.PATIENT_ID)
        tc = sort_dataframe(tc, [cfg.CITIZEN_ID, cfg.RATING_DATE])
        return tc

    def clean_assistive_aids(self, ats, ic, ids=None):
        """Run the assistive-aids cleaning pipeline.

        ``ids`` is the collection of citizen ids to keep and ``ic`` supplies
        the accepted ISO classes. Both the lend date and the return date
        columns are checked for implausible dates.
        """
        ats = sort_dataframe(ats, [cfg.CITIZEN_ID, cfg.LEND_DATE])
        ats = filter_ats_on_ids(ats, ids)
        ats = remove_rows_with_old_dates(ats, cfg.LEND_DATE)
        ats = remove_rows_with_old_dates(ats, cfg.RETURN_DATE)
        # NOTE(review): the deprecated-device rows are removed first, so the
        # tainted-history step that follows no longer finds any marker rows
        # and leaves the data as is. Confirm whether the order is intended.
        ats = remove_deprecated_device_data(ats)
        ats = remove_tainted_histories(ats)
        ats = drop_invalid_devices(ats, ic)
        return ats

    def clean_fall_data(self, fd):
        """Fall data cleaning is not implemented for this cleaner.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError

def drop_invalid_devices(ats, iso_classes):
    """Keep only the rows whose ISO class occurs in ``iso_classes.DevISOClass``."""
    accepted = iso_classes.DevISOClass
    is_accepted = ats[cfg.DEV_ISO_CLASS].isin(accepted)
    return ats[is_accepted]

def remove_screenings_without_exercises(df):
    """Drop rows whose exercise content equals the literal string 'nan'."""
    has_exercises = df[cfg.EXERCISE_CONTENT] != 'nan'
    return df[has_exercises]

def remove_citizens_not_in_patient_data(train_data, patient_data, id):
    """Keep the rows of ``train_data`` whose ``id`` value occurs in ``patient_data``.

    ``id`` is the name of the column that both frames share.
    """
    known_ids = patient_data[id].unique()
    is_known = train_data[id].isin(known_ids)
    return train_data[is_known]

def remove_citizens_without_valid_id(df):
    """Drop rows whose citizen id is one of the hard-coded rejected values."""
    # Ids that are filtered out; why '681' is among them is not evident here.
    rejected_ids = ["0000000000", '0', "#VALUE!", 'nan', '681']
    is_rejected = df[cfg.CITIZEN_ID].isin(rejected_ids)
    return df[~is_rejected]

def merge_train_and_patient_data(train_data, patient_data, key):
    """Join ``train_data`` with ``patient_data`` on ``key`` using pandas' default merge."""
    merged = train_data.merge(patient_data, on=key)
    return merged

def sort_dataframe(data, by):
    """Return a copy of ``data`` ordered by the column(s) given in ``by``."""
    ordered = data.sort_values(by=by)
    return ordered

def filter_ats_on_ids(ats, ids):
    """Keep only the rows of ``ats`` whose citizen id is contained in ``ids``.

    Args:
        ats: Assistive-aids data frame with a ``cfg.CITIZEN_ID`` column.
        ids: List-like of citizen ids to keep, or ``None`` to skip filtering.

    Returns:
        The filtered frame, or ``ats`` itself when ``ids`` is ``None``.
    """
    # ``Series.isin(None)`` raises TypeError, so "no ids given" has to be
    # handled explicitly instead of being passed through to pandas.
    if ids is None:
        return ats
    return ats[ats[cfg.CITIZEN_ID].isin(ids)]

def remove_tainted_histories(ats):
    """Drop every row of citizens that have at least one '899,999' device row."""
    has_marker = ats[cfg.DEV_HMI_NUMBER] == '899,999'
    tainted_ids = ats.loc[has_marker, cfg.CITIZEN_ID].unique()
    is_tainted = ats[cfg.CITIZEN_ID].isin(tainted_ids)
    return ats[~is_tainted]

def remove_deprecated_device_data(ats):
    """Drop the rows whose HMI number is the marker value '899,999'."""
    is_current = ats[cfg.DEV_HMI_NUMBER] != '899,999'
    return ats.loc[is_current]

def remove_rows_with_old_dates(ats, col):
    """Keep the rows whose ``col`` date lies between 1900-01-01 and today.

    The column is parsed with ``pd.to_datetime``; the returned frame carries
    the parsed datetime values. Rows that fall outside the range, or whose
    date is missing (NaT), are dropped.

    Args:
        ats: Data frame containing the date column.
        col: Name of the date column to check.

    Returns:
        A new data frame; ``ats`` itself is left unmodified.
    """
    # Parse into a separate Series instead of assigning back into ``ats``:
    # callers frequently pass in a filtered slice, and writing to it would
    # mutate their data and trigger pandas' SettingWithCopyWarning.
    dates = pd.to_datetime(ats[col])
    in_range = (dates >= '1900-01-01') & (dates <= pd.Timestamp('today'))
    kept = ats.loc[in_range].copy()
    kept[col] = dates.loc[in_range]
    return kept