cleaner.py 6.91 KB
Newer Older
1
2
3
import os
from abc import ABC, abstractmethod
from typing import List, Optional, Union

import numpy as np
import pandas as pd
6

7
8
class BaseCleaner(ABC):
    """
    Abstract base class for the data set cleaners.

    Subclasses implement the ``clean_*`` methods. The concrete helper
    methods defined here are the filter, merge and sort steps those
    implementations are assembled from. None of the helpers modifies the
    dataframe it is given; each returns a new dataframe.
    """

    # CitizenId values that are treated as invalid placeholders.
    _INVALID_CITIZEN_IDS = ("0000000000", "0", "#VALUE!", "681")

    # DevHMINumber value used to identify deprecated device rows.
    _DEPRECATED_HMI_NUMBER = "899,999"

    @abstractmethod
    def clean_clusters(self, cl: pd.DataFrame) -> pd.DataFrame:
        """Cleans the cluster data set"""

    @abstractmethod
    def clean_patient_data(self, ptd: pd.DataFrame) -> pd.DataFrame:
        """Cleans the patient data set"""

    @abstractmethod
    def clean_screening_content(self, sc: pd.DataFrame,
                                ptd: pd.DataFrame) -> pd.DataFrame:
        """Cleans the screening content data set"""

    @abstractmethod
    def clean_status_set(self, ss: pd.DataFrame,
                         ptd: pd.DataFrame) -> pd.DataFrame:
        """Cleans the status set data set"""

    @abstractmethod
    def clean_training_done(self, td: pd.DataFrame,
                            ptd: pd.DataFrame) -> pd.DataFrame:
        """Cleans the training done data set"""

    @abstractmethod
    def clean_training_cancelled(self, tc: pd.DataFrame,
                                 ptd: pd.DataFrame) -> pd.DataFrame:
        """Cleans the training cancelled data set"""

    @abstractmethod
    def clean_assistive_aids(self, ats, ic, ids):
        """Cleans the assistive aids data set"""

    def remove_citizens_not_in_patient_data(self, train_data: pd.DataFrame,
                                            patient_data: pd.DataFrame,
                                            id_col: str) -> pd.DataFrame:
        """
        This method removes citizens not in the patient data set
        :param train_data: DigiRehab training data
        :param patient_data: DigiRehab patient data
        :param id_col: the name of the column identifying a citizen
        :return: the rows of train_data whose id occurs in patient_data
        """
        known_ids = patient_data[id_col].unique()
        return train_data[train_data[id_col].isin(known_ids)]

    def remove_citizens_without_valid_id(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        This method removes citizens without a valid id, i.e. rows whose
        'CitizenId' is missing or is one of the known placeholder values
        :param df: a dataframe with a 'CitizenId' column
        :return: cleaned dataframe
        """
        citizen_ids = df['CitizenId']
        is_placeholder = citizen_ids.isin(list(self._INVALID_CITIZEN_IDS))
        return df[citizen_ids.notna() & ~is_placeholder]

    def merge_train_and_patient_data(self, train_data: pd.DataFrame,
                                     patient_data: pd.DataFrame,
                                     id_col: str) -> pd.DataFrame:
        """
        This method merges the training and patient data
        :param train_data: DigiRehab training data
        :param patient_data: DigiRehab patient data
        :param id_col: the name of the column identifying a citizen
        :return: merged dataframe (pandas default join, i.e. inner, on id_col)
        """
        return pd.merge(train_data, patient_data, on=id_col)

    def sort_dataframe(self, df: pd.DataFrame,
                       by: Union[str, List[str]]) -> pd.DataFrame:
        """
        This method sorts a dataframe based on one or more column names
        :param df: dataframe to be sorted
        :param by: column name, or list of column names, to sort by
        :return: sorted dataframe
        """
        return df.sort_values(by)

    def filter_ats_on_ids(self, df: pd.DataFrame, ids: List[str]) -> pd.DataFrame:
        """
        This method filters a dataframe containing ats
        data by a list of ids in the dataframe
        :param df: dataframe containing ats
        :param ids: 'CitizenId' values to keep
        :return: filtered dataframe
        """
        return df[df['CitizenId'].isin(ids)]

    def remove_tainted_histories(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        This method removes tainted loan histories: every row of a citizen
        who has at least one row with the deprecated 'DevHMINumber' value
        :param df: dataframe containing ats
        :return: a cleaned dataframe
        """
        is_deprecated = df['DevHMINumber'] == self._DEPRECATED_HMI_NUMBER
        tainted_ids = df.loc[is_deprecated, 'CitizenId'].unique()
        return df[~df['CitizenId'].isin(tainted_ids)]

    def remove_deprecated_device_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        This method removes deprecated device data, i.e. rows whose
        'DevHMINumber' equals the deprecated value
        :param df: dataframe containing ats
        :return: a cleaned dataframe
        """
        return df[df['DevHMINumber'] != self._DEPRECATED_HMI_NUMBER]

    def remove_rows_with_old_dates(self, df: pd.DataFrame, date_col: str) -> pd.DataFrame:
        """
        This method removes rows whose date lies before 1900-01-01 or
        after the current time. The date column of the returned dataframe
        is converted to datetime; the input dataframe is left untouched.
        :param df: dataframe containing ats
        :param date_col: the name of the date column
        :return: a cleaned dataframe
        """
        # Parse into a separate Series instead of assigning into df, so the
        # caller's dataframe (possibly a slice of another one) is not mutated.
        dates = pd.to_datetime(df[date_col])
        in_range = (dates >= pd.Timestamp('1900-01-01')) & (dates <= pd.Timestamp('today'))
        cleaned = df.loc[in_range].copy()
        cleaned[date_col] = dates[in_range]
        return cleaned

    def drop_invalid_devices(self, df: pd.DataFrame, iso_classes: pd.DataFrame) -> pd.DataFrame:
        """
        This method removes invalid devices not in iso classes
        :param df: dataframe containing ats
        :param iso_classes: dataframe with the iso classes
        :return: a cleaned dataframe
        """
        return df[df['DevISOClass'].isin(iso_classes['DevISOClass'])]

    def remove_screenings_without_exercises(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        This method removes screenings without exercises
        :param df: dataframe containing screenings
        :return: a cleaned dataframe
        """
        # NOTE(review): only the literal string 'nan' is filtered out, which
        # presumably means the column was cast to str upstream; real NaN
        # values are kept. Confirm against the caller.
        return df[df['ExerciseContent'] != 'nan']
139
140

class Cleaner2020(BaseCleaner):
    """
    Concrete :class:`BaseCleaner` implementation.

    Each ``clean_*`` method chains the helper methods inherited from
    the base class and returns the resulting dataframe.
    """

    def _restrict_merge_sort(self, df: pd.DataFrame, ptd: pd.DataFrame,
                             date_col: str) -> pd.DataFrame:
        """
        Keeps only citizens present in the patient data, merges the patient
        data onto the rows and sorts by 'CitizenId' and the given date column
        :param df: the data set to clean
        :param ptd: the patient data
        :param date_col: the name of the secondary sort column
        :return: the cleaned, merged and sorted dataframe
        """
        df = self.remove_citizens_not_in_patient_data(df, ptd, 'CitizenId')
        df = self.merge_train_and_patient_data(df, ptd, 'CitizenId')
        return self.sort_dataframe(df, ['CitizenId', date_col])

    def clean_clusters(self, cl: pd.DataFrame) -> pd.DataFrame:
        """Returns the cluster data set unchanged"""
        return cl

    def clean_patient_data(self, ptd: pd.DataFrame) -> pd.DataFrame:
        """Removes rows without a valid 'CitizenId' from the patient data"""
        return self.remove_citizens_without_valid_id(ptd)

    def clean_screening_content(self, sc: pd.DataFrame, ptd: pd.DataFrame) -> pd.DataFrame:
        """
        Cleans the screening content data set
        :param sc: the screening content data
        :param ptd: the patient data
        :return: screenings of known citizens merged with the patient data,
                 sorted by 'CitizenId' and 'ScreeningDate'
        """
        return self._restrict_merge_sort(sc, ptd, 'ScreeningDate')

    def clean_status_set(self, ss: pd.DataFrame, ptd: pd.DataFrame) -> pd.DataFrame:
        """
        Cleans the status set data set
        :param ss: the status set data
        :param ptd: the patient data
        :return: status rows of known citizens merged with the patient data,
                 sorted by 'CitizenId' and 'ChangeDate'
        """
        return self._restrict_merge_sort(ss, ptd, 'ChangeDate')

    def clean_training_done(self, td: pd.DataFrame, ptd: pd.DataFrame) -> pd.DataFrame:
        """
        Cleans the training done data set
        :param td: the training done data
        :param ptd: the patient data
        :return: training rows of known citizens sorted by 'CitizenId' and
                 'RatingDate' (the patient data is not merged in here)
        """
        td = self.remove_citizens_not_in_patient_data(td, ptd, 'CitizenId')
        return self.sort_dataframe(td, ['CitizenId', 'RatingDate'])

    def clean_training_cancelled(self, tc: pd.DataFrame, ptd: pd.DataFrame) -> pd.DataFrame:
        """
        Cleans the training cancelled data set
        :param tc: the training cancelled data
        :param ptd: the patient data
        :return: cancellation rows of known citizens merged with the patient
                 data, sorted by 'CitizenId' and 'RatingDate'
        """
        return self._restrict_merge_sort(tc, ptd, 'RatingDate')

    def clean_assistive_aids(self, ats: pd.DataFrame, ic: pd.DataFrame,
                             ids: Optional[List[str]] = None) -> pd.DataFrame:
        """
        Cleans the assistive aids data set
        :param ats: the assistive aids (ats) data
        :param ic: dataframe with the iso classes
        :param ids: optional 'CitizenId' values to restrict the result to;
                    accepted so the signature is compatible with the abstract
                    method, and skipped entirely when None
        :return: a cleaned dataframe
        """
        ats = self.sort_dataframe(ats, ['CitizenId', 'LendDate'])
        ats = self.remove_citizens_without_valid_id(ats)
        ats = self.remove_rows_with_old_dates(ats, 'LendDate')
        # Deprecated rows are dropped before the tainted-history filter, in
        # the same order as before, so the resulting rows are unchanged.
        ats = self.remove_deprecated_device_data(ats)
        ats = self.remove_tainted_histories(ats)
        ats = self.drop_invalid_devices(ats, ic)
        if ids is not None:
            ats = self.filter_ats_on_ids(ats, ids)
        return ats