cleaner.py 10.7 KB
Newer Older
thecml's avatar
thecml committed
1
2
3
4
5
6
"""
cleaner.py
====================================
Module to clean raw data.
"""

7
from abc import ABC, abstractmethod
from typing import List, Union

import numpy as np
import pandas as pd
11

12
class BaseCleaner(ABC):
    """
    Base class for cleaners.

    Subclasses implement the abstract ``clean_*`` methods; the concrete
    methods below are shared filtering/sorting helpers they can compose.
    """

    # CitizenId values that remove_citizens_without_valid_id treats as invalid.
    _INVALID_CITIZEN_IDS = ("0000000000", "0", "#VALUE!", "681")

    # DevHMINumber value used by remove_deprecated_device_data and
    # remove_tainted_histories to recognise deprecated device rows.
    _DEPRECATED_HMI_NUMBER = "899,999"

    @abstractmethod
    def clean_patient_data(self, patient_data):
        """Cleans the patient data set."""

    @abstractmethod
    def clean_screening_content(self, screening_content, patient_data):
        """Cleans the screening content data set."""

    @abstractmethod
    def clean_status_set(self, status_set, patient_data):
        """Cleans the status set data set."""

    @abstractmethod
    def clean_training_done(self, training_done, patient_data):
        """Cleans the training done data set."""

    @abstractmethod
    def clean_training_cancelled(self, training_cancelled, patient_data):
        """Cleans the training cancelled data set."""

    @abstractmethod
    def clean_assistive_aids(self, ats, iso_classes):
        """Cleans the assistive aids data set."""

    def remove_citizens_not_in_patient_data(self, train_data: pd.DataFrame,
                                            patient_data: pd.DataFrame,
                                            id_col: str) -> pd.DataFrame:
        """
        This method removes citizens not in patient data set.

        :param train_data: DigiRehab training data
        :param patient_data: DigiRehab patient data
        :param id_col: the name of the column identifying a citizen
        :return: the rows of train_data whose id also occurs in patient_data
        """
        known_ids = patient_data[id_col].unique()
        return train_data[train_data[id_col].isin(known_ids)]

    def remove_citizens_without_valid_id(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        This method removes citizens without a valid id.

        A row is dropped when its 'CitizenId' is missing or equals one of
        the values in ``_INVALID_CITIZEN_IDS``.

        :param df: a dataframe with a 'CitizenId' column
        :return: cleaned dataframe
        """
        invalid = df['CitizenId'].isin(list(self._INVALID_CITIZEN_IDS))
        return df[~invalid].dropna(subset=['CitizenId'])

    def merge_train_and_patient_data(self, train_data: pd.DataFrame,
                                     patient_data: pd.DataFrame,
                                     id_col: str) -> pd.DataFrame:
        """
        This method merges the training and patient data.

        :param train_data: DigiRehab training data
        :param patient_data: DigiRehab patient data
        :param id_col: the name of the column identifying a citizen
        :return: merged dataframe (pandas default inner join on id_col)
        """
        return pd.merge(train_data, patient_data, on=id_col)

    def sort_dataframe(self, df: pd.DataFrame,
                       col_name: Union[str, List[str]]) -> pd.DataFrame:
        """
        This method sorts a dataframe based on one or more column names.

        :param df: dataframe to be sorted
        :param col_name: column name, or list of column names, to sort by
        :return: sorted dataframe
        """
        return df.sort_values(col_name)

    def filter_ats_on_ids(self, df: pd.DataFrame, ids: List[str]) -> pd.DataFrame:
        """
        This method filters a dataframe containing ats data by a list of
        citizen ids.

        :param df: dataframe containing ats
        :param ids: ids to filter by
        :return: the rows of df whose 'CitizenId' is in ids
        """
        return df[df['CitizenId'].isin(ids)]

    def remove_tainted_histories(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        This method removes tainted loan histories.

        Every row of a citizen is dropped if at least one of that citizen's
        rows carries the deprecated 'DevHMINumber' marker.

        :param df: dataframe containing ats
        :return: a cleaned dataframe
        """
        is_deprecated = df['DevHMINumber'] == self._DEPRECATED_HMI_NUMBER
        tainted_ids = df.loc[is_deprecated, 'CitizenId'].unique()
        return df[~df['CitizenId'].isin(tainted_ids)]

    def remove_deprecated_device_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        This method removes deprecated device data.

        Only the rows carrying the deprecated 'DevHMINumber' marker are
        dropped; other rows of the same citizen are kept.

        :param df: dataframe containing ats
        :return: a cleaned dataframe
        """
        return df[df['DevHMINumber'] != self._DEPRECATED_HMI_NUMBER]

    def remove_rows_with_old_dates(self, df: pd.DataFrame, date_col: str) -> pd.DataFrame:
        """
        This method removes rows with dates before 1900-01-01 or after today.

        The date column of the returned dataframe is converted to datetime.
        The input dataframe is left untouched.

        :param df: dataframe containing ats
        :param date_col: the name of the date column
        :return: a cleaned dataframe
        """
        # Work on a copy: assigning into the caller's frame mutates their
        # data and raises SettingWithCopyWarning when df is a filtered slice.
        converted = df.copy()
        converted[date_col] = pd.to_datetime(converted[date_col])
        dates = converted[date_col]
        in_range = (dates >= '1900-01-01') & (dates <= pd.Timestamp('today'))
        return converted.loc[in_range]

    def drop_invalid_devices(self, df: pd.DataFrame, iso_classes: pd.DataFrame) -> pd.DataFrame:
        """
        This method removes invalid devices not in iso classes.

        :param df: dataframe containing ats
        :param iso_classes: dataframe with the iso classes
        :return: the rows of df whose 'DevISOClass' occurs in iso_classes
        """
        return df[df['DevISOClass'].isin(iso_classes['DevISOClass'])]

    def remove_screenings_without_exercises(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        This method removes screenings without exercises.

        A screening counts as having no exercises when 'ExerciseContent' is
        missing or is the literal string 'nan'.

        :param df: dataframe containing screenings
        :return: a cleaned dataframe
        """
        content = df['ExerciseContent']
        # The 'nan' string check alone lets real missing values through,
        # because NaN != 'nan' evaluates to True.
        return df[content.notna() & (content != 'nan')]
143

144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
class Cleaner2021(BaseCleaner):
    """Cleaner for 2021 dataset"""

    def clean_patient_data(self, patient_data: pd.DataFrame) -> pd.DataFrame:
        """Drop patient rows whose citizen id is not valid."""
        return self.remove_citizens_without_valid_id(patient_data)

    def clean_screening_content(self, screening_content: pd.DataFrame,
                                patient_data: pd.DataFrame) -> pd.DataFrame:
        """Keep screenings of known patients, sorted by citizen and screening date."""
        known = self.remove_citizens_not_in_patient_data(
            screening_content, patient_data, 'CitizenId')
        return self.sort_dataframe(known, ['CitizenId', 'ScreeningDate'])

    def clean_status_set(self, status_set: pd.DataFrame,
                         patient_data: pd.DataFrame) -> pd.DataFrame:
        """Keep status rows of known patients, sorted by citizen and change date."""
        known = self.remove_citizens_not_in_patient_data(
            status_set, patient_data, 'CitizenId')
        return self.sort_dataframe(known, ['CitizenId', 'ChangeDate'])

    def clean_training_done(self, training_done: pd.DataFrame,
                            patient_data: pd.DataFrame) -> pd.DataFrame:
        """Keep completed trainings of known patients, sorted by citizen and rating date."""
        known = self.remove_citizens_not_in_patient_data(
            training_done, patient_data, 'CitizenId')
        return self.sort_dataframe(known, ['CitizenId', 'RatingDate'])

    def clean_training_cancelled(self, training_cancelled: pd.DataFrame,
                                 patient_data: pd.DataFrame) -> pd.DataFrame:
        """Keep cancelled trainings of known patients, sorted by citizen and rating date."""
        known = self.remove_citizens_not_in_patient_data(
            training_cancelled, patient_data, 'CitizenId')
        return self.sort_dataframe(known, ['CitizenId', 'RatingDate'])

    def clean_assistive_aids(self, ats: pd.DataFrame,
                             iso_classes: pd.DataFrame) -> pd.DataFrame:
        """
        Clean the assistive aids data set.

        Steps, in order: sort by citizen and lend date, drop rows without a
        valid citizen id, drop rows with out-of-range lend dates, and drop
        devices whose ISO class is not listed in iso_classes.
        """
        ordered = self.sort_dataframe(ats, ['CitizenId', 'LendDate'])
        with_valid_ids = self.remove_citizens_without_valid_id(ordered)
        with_valid_dates = self.remove_rows_with_old_dates(with_valid_ids, 'LendDate')
        return self.drop_invalid_devices(with_valid_dates, iso_classes)

class Cleaner2020(BaseCleaner):
    """Cleaner for 2020 dataset"""

    def clean_patient_data(self, patient_data: pd.DataFrame) -> pd.DataFrame:
        """Drop patient rows whose citizen id is not valid."""
        patient_data = self.remove_citizens_without_valid_id(patient_data)
        return patient_data

    def clean_screening_content(self, screening_content: pd.DataFrame,
                                patient_data: pd.DataFrame) -> pd.DataFrame:
        """
        Keep screenings of known patients, merge in the patient data and
        sort by citizen and screening date.
        """
        screening_content = self.remove_citizens_not_in_patient_data(screening_content,
                                                                     patient_data, 'CitizenId')
        screening_content = self.merge_train_and_patient_data(screening_content,
                                                              patient_data, 'CitizenId')
        screening_content = self.sort_dataframe(screening_content,
                                                ['CitizenId', 'ScreeningDate'])
        return screening_content

    def clean_status_set(self, status_set: pd.DataFrame,
                         patient_data: pd.DataFrame) -> pd.DataFrame:
        """
        Keep status rows of known patients, merge in the patient data and
        sort by citizen and change date.
        """
        status_set = self.remove_citizens_not_in_patient_data(status_set,
                                                              patient_data, 'CitizenId')
        status_set = self.merge_train_and_patient_data(status_set,
                                                       patient_data, 'CitizenId')
        status_set = self.sort_dataframe(status_set,
                                         ['CitizenId', 'ChangeDate'])
        return status_set

    def clean_training_done(self, training_done: pd.DataFrame,
                            patient_data: pd.DataFrame) -> pd.DataFrame:
        """
        Keep completed trainings of known patients, sorted by citizen and
        rating date.

        Unlike the other patient-linked cleaners in this class, this one
        does not merge the patient data into the result.
        """
        training_done = self.remove_citizens_not_in_patient_data(training_done,
                                                                 patient_data, 'CitizenId')
        training_done = self.sort_dataframe(training_done,
                                            ['CitizenId', 'RatingDate'])
        return training_done

    def clean_training_cancelled(self, training_cancelled: pd.DataFrame,
                                 patient_data: pd.DataFrame) -> pd.DataFrame:
        """
        Keep cancelled trainings of known patients, merge in the patient
        data and sort by citizen and rating date.
        """
        training_cancelled = self.remove_citizens_not_in_patient_data(training_cancelled,
                                                                      patient_data, 'CitizenId')
        training_cancelled = self.merge_train_and_patient_data(training_cancelled,
                                                               patient_data, 'CitizenId')
        training_cancelled = self.sort_dataframe(training_cancelled,
                                                 ['CitizenId', 'RatingDate'])
        return training_cancelled

    def clean_assistive_aids(self, ats: pd.DataFrame,
                             iso_classes: pd.DataFrame) -> pd.DataFrame:
        """
        Clean the assistive aids data set.

        Steps, in order: sort by citizen and lend date, drop rows without a
        valid citizen id, drop rows with out-of-range lend dates, drop
        deprecated device rows, drop tainted histories, and drop devices
        whose ISO class is not listed in iso_classes.
        """
        ats = self.sort_dataframe(ats, ['CitizenId', 'LendDate'])
        ats = self.remove_citizens_without_valid_id(ats)
        ats = self.remove_rows_with_old_dates(ats, 'LendDate')
        ats = self.remove_deprecated_device_data(ats)
        # NOTE(review): the previous step already removed every row with the
        # deprecated DevHMINumber, so remove_tainted_histories finds no
        # marker rows here and filters nothing. If whole histories should be
        # dropped, this call has to run before remove_deprecated_device_data.
        # Order kept as-is to preserve the current output; confirm intent.
        ats = self.remove_tainted_histories(ats)
        ats = self.drop_invalid_devices(ats, iso_classes)
        return ats