cleaner.py 4.92 KB
Newer Older
1
2
3
import pandas as pd
import numpy as np
import os
4
from abc import ABC, abstractmethod
Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
5
from typing import List
6

7
8
class BaseCleaner(ABC):
    """Interface that every data set cleaner must implement.

    Each method receives one raw data set (plus, where needed, the patient
    data or other lookup frames) and returns the cleaned data set. The
    signatures mirror those of the concrete implementation in this module so
    that code written against the base class and code written against a
    subclass agree on parameter types and defaults.
    """

    @abstractmethod
    def clean_clusters(self, cl: pd.DataFrame) -> pd.DataFrame:
        """Cleans the cluster data set"""

    @abstractmethod
    def clean_patient_data(self, ptd: pd.DataFrame) -> pd.DataFrame:
        """Cleans the patient data set"""

    @abstractmethod
    def clean_screening_content(self, sc: pd.DataFrame, ptd: pd.DataFrame) -> pd.DataFrame:
        """Cleans the screening content data set"""

    @abstractmethod
    def clean_status_set(self, ss: pd.DataFrame, ptd: pd.DataFrame) -> pd.DataFrame:
        """Cleans the status set data set"""

    @abstractmethod
    def clean_training_done(self, td: pd.DataFrame, ptd: pd.DataFrame) -> pd.DataFrame:
        """Cleans the training done data set"""

    @abstractmethod
    def clean_training_cancelled(self, tc: pd.DataFrame, ptd: pd.DataFrame) -> pd.DataFrame:
        """Cleans the training cancelled data set"""

    @abstractmethod
    def clean_assistive_aids(self, ats: pd.DataFrame, ic: pd.DataFrame,
                             ids: List[str]=None) -> pd.DataFrame:
        """Cleans the assistive aids data set

        ``ids`` is optional (defaults to ``None``), matching the concrete
        implementation's signature.
        """

    @abstractmethod
    def clean_fall_data(self, fd: pd.DataFrame) -> pd.DataFrame:
        """Cleans the fall set"""

class Cleaner2020(BaseCleaner):
    """Concrete cleaner whose steps delegate to the module-level helper functions."""

    def clean_clusters(self, cl: pd.DataFrame) -> pd.DataFrame:
        """Return the cluster data unchanged; no cleaning is applied."""
        return cl

    def clean_patient_data(self, ptd: pd.DataFrame) -> pd.DataFrame:
        """Return the patient data without rows that carry an invalid citizen id."""
        return remove_citizens_without_valid_id(ptd)

    def clean_screening_content(self, sc: pd.DataFrame, ptd: pd.DataFrame) -> pd.DataFrame:
        """Restrict screenings to known citizens, join patient data, sort by citizen and date."""
        known = remove_citizens_not_in_patient_data(sc, ptd, 'CitizenId')
        joined = merge_train_and_patient_data(known, ptd, 'CitizenId')
        return sort_dataframe(joined, ['CitizenId', 'ScreeningDate'])

    def clean_status_set(self, ss: pd.DataFrame, ptd: pd.DataFrame) -> pd.DataFrame:
        """Restrict status rows to known citizens, join patient data, sort by citizen and date."""
        known = remove_citizens_not_in_patient_data(ss, ptd, 'CitizenId')
        joined = merge_train_and_patient_data(known, ptd, 'CitizenId')
        return sort_dataframe(joined, ['CitizenId', 'ChangeDate'])

    def clean_training_done(self, td: pd.DataFrame, ptd: pd.DataFrame) -> pd.DataFrame:
        """Restrict completed trainings to known citizens and sort by citizen and date.

        Unlike the other training-related steps, patient data is not merged in.
        """
        known = remove_citizens_not_in_patient_data(td, ptd, 'CitizenId')
        return sort_dataframe(known, ['CitizenId', 'RatingDate'])

    def clean_training_cancelled(self, tc: pd.DataFrame, ptd: pd.DataFrame) -> pd.DataFrame:
        """Restrict cancelled trainings to known citizens, join patient data, sort by citizen and date."""
        known = remove_citizens_not_in_patient_data(tc, ptd, 'CitizenId')
        joined = merge_train_and_patient_data(known, ptd, 'CitizenId')
        return sort_dataframe(joined, ['CitizenId', 'RatingDate'])

    def clean_assistive_aids(self, ats: pd.DataFrame, ic: pd.DataFrame,
                             ids: List[str]=None) -> pd.DataFrame:
        """Sort and filter the assistive aids data.

        Steps, in order: sort by citizen and lend date, drop invalid citizen
        ids, drop rows with out-of-range lend dates, drop deprecated device
        rows, drop tainted histories, and keep only devices whose ISO class
        is listed in ``ic``.
        """
        # NOTE(review): `ids` is accepted but never used here, although the
        # module defines filter_ats_on_ids -- confirm whether filtering on
        # `ids` was intended.
        ordered = sort_dataframe(ats, ['CitizenId', 'LendDate'])
        with_valid_ids = remove_citizens_without_valid_id(ordered)
        with_valid_dates = remove_rows_with_old_dates(with_valid_ids, 'LendDate')
        without_deprecated = remove_deprecated_device_data(with_valid_dates)
        # NOTE(review): as the helpers are currently written, the previous step
        # already removed every row that remove_tainted_histories looks for, so
        # this call appears to filter nothing -- confirm the intended order.
        untainted = remove_tainted_histories(without_deprecated)
        return drop_invalid_devices(untainted, ic)

    def clean_fall_data(self, fd: pd.DataFrame) -> pd.DataFrame:
        """Drop fall rows with an invalid citizen id and sort by citizen and date."""
        with_valid_ids = remove_citizens_without_valid_id(fd)
        return sort_dataframe(with_valid_ids, ['CitizenId', 'Date'])
                
Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
86
def drop_invalid_devices(ats: pd.DataFrame, iso_classes: pd.DataFrame) -> pd.DataFrame:
    """Keep only the rows of ``ats`` whose ``DevISOClass`` occurs in ``iso_classes``.

    Both frames must have a ``DevISOClass`` column; the original row index of
    the kept rows is preserved.
    """
    has_known_class = ats['DevISOClass'].isin(iso_classes['DevISOClass'])
    return ats[has_known_class]
88

Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
89
def remove_screenings_without_exercises(df: pd.DataFrame) -> pd.DataFrame:
    """Drop the rows of ``df`` that have no exercise content.

    A row counts as empty when its ``ExerciseContent`` is either the literal
    string ``'nan'`` (what a missing value becomes once the column has been
    cast to ``str``) or a real missing value (NaN/None). Only checking for the
    string would let real missing values through, because NaN compares
    unequal to every string.

    Returns a new frame; ``df`` itself is not modified.
    """
    content = df['ExerciseContent']
    has_exercises = content.notna() & (content != 'nan')
    return df[has_exercises]

Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
93
94
95
def remove_citizens_not_in_patient_data(train_data: pd.DataFrame,
                                        patient_data: pd.DataFrame,
                                        id: str) -> pd.DataFrame:
    """Keep only the rows of ``train_data`` whose ``id`` value occurs in ``patient_data``.

    ``id`` is the name of the identifier column, which must exist in both
    frames. The original row index of the kept rows is preserved.
    """
    known_ids = patient_data[id].unique()
    is_known = train_data[id].isin(known_ids)
    return train_data[is_known]

Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
99
def remove_citizens_without_valid_id(df: pd.DataFrame) -> pd.DataFrame:
    """Drop the rows of ``df`` whose ``CitizenId`` is a known placeholder or missing.

    The rejected placeholder values are the strings ``"0000000000"``, ``'0'``,
    ``"#VALUE!"`` and ``'681'``; rows with a missing ``CitizenId`` are dropped
    as well.
    """
    placeholder_ids = ["0000000000", '0', "#VALUE!", '681']
    is_placeholder = df['CitizenId'].isin(placeholder_ids)
    without_placeholders = df[~is_placeholder]
    return without_placeholders.dropna(subset=['CitizenId'])

Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
107
108
109
def merge_train_and_patient_data(train_data: pd.DataFrame,
                                 patient_data: pd.DataFrame,
                                 key: str) -> pd.DataFrame:
    """Join ``train_data`` with ``patient_data`` on the column ``key``.

    Uses the pandas default merge, so only rows whose key is present in both
    frames end up in the result.
    """
    merged = train_data.merge(patient_data, on=key)
    return merged

Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
112
def sort_dataframe(data: pd.DataFrame, by: str) -> pd.DataFrame:
    """Return a copy of ``data`` sorted in ascending order by ``by``.

    ``by`` is forwarded to ``DataFrame.sort_values`` unchanged, so it may be a
    single column name or, as the callers in this module pass, a list of
    column names. ``data`` itself is left untouched.
    """
    ordered = data.sort_values(by=by)
    return ordered

Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
115
def filter_ats_on_ids(ats: pd.DataFrame, ids: List[str]) -> pd.DataFrame:
    """Keep only the rows of ``ats`` whose ``CitizenId`` is one of ``ids``."""
    is_requested = ats['CitizenId'].isin(ids)
    return ats[is_requested]
117

Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
118
def remove_tainted_histories(ats: pd.DataFrame) -> pd.DataFrame:
    """Drop every row belonging to a citizen who has a ``'899,999'`` device row.

    A citizen is treated as tainted when at least one of their rows has
    ``DevHMINumber == '899,999'``; all rows of such citizens are removed, not
    only the offending ones.
    """
    is_marker_row = ats['DevHMINumber'] == '899,999'
    tainted_ids = ats.loc[is_marker_row, 'CitizenId'].unique()
    is_tainted = ats['CitizenId'].isin(tainted_ids)
    return ats[~is_tainted]

Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
123
def remove_deprecated_device_data(ats: pd.DataFrame) -> pd.DataFrame:
    """Drop the rows of ``ats`` whose ``DevHMINumber`` equals ``'899,999'``."""
    is_current = ats['DevHMINumber'].ne('899,999')
    return ats[is_current]
125

Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
126
def remove_rows_with_old_dates(ats: pd.DataFrame, col: str) -> pd.DataFrame:
    """Keep only the rows whose date in ``col`` lies between 1900-01-01 and now.

    The column is parsed with ``pd.to_datetime`` and the returned frame holds
    the parsed datetimes in ``col``. Rows whose date is missing (NaT) fail
    both comparisons and are dropped as well.

    The conversion is done on a copy: the caller's frame is never modified,
    and no SettingWithCopyWarning is raised when ``ats`` is itself the result
    of an earlier filtering step.
    """
    ats = ats.copy()
    ats[col] = pd.to_datetime(ats[col])
    earliest = pd.Timestamp('1900-01-01')
    latest = pd.Timestamp('today')
    in_range = (ats[col] >= earliest) & (ats[col] <= latest)
    return ats.loc[in_range]