cleaner.py 3.12 KB
Newer Older
thecml's avatar
thecml committed
1
2
3
4
5
6
"""
cleaner.py
====================================
Module to clean raw data.
"""

7
from abc import ABC, abstractmethod
Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
8
from typing import List
thecml's avatar
thecml committed
9
10
import pandas as pd
import numpy as np
11

12
class BaseCleaner(ABC):
    """
    Base class for cleaners.

    Declares one abstract cleaning method per raw data set. A subclass
    must override every method below before it can be instantiated.
    """

    @abstractmethod
    def clean_patient_data(self, patient_data: pd.DataFrame) -> pd.DataFrame:
        """Cleans the patient data set."""

    @abstractmethod
    def clean_screening_content(self, screening_content: pd.DataFrame,
                                patient_data: pd.DataFrame) -> pd.DataFrame:
        """Cleans the screening content data set."""

    @abstractmethod
    def clean_status_set(self, status_set: pd.DataFrame,
                         patient_data: pd.DataFrame) -> pd.DataFrame:
        """Cleans the status set data set."""

    @abstractmethod
    def clean_training_done(self, training_done: pd.DataFrame,
                            patient_data: pd.DataFrame) -> pd.DataFrame:
        """Cleans the training done data set."""

    @abstractmethod
    def clean_training_cancelled(self, training_cancelled: pd.DataFrame,
                                 patient_data: pd.DataFrame) -> pd.DataFrame:
        """Cleans the training cancelled data set."""

    @abstractmethod
    def clean_assistive_aids(self, ats: pd.DataFrame,
                             iso_classes: pd.DataFrame) -> pd.DataFrame:
        """Cleans the assistive aids data set."""

thecml's avatar
thecml committed
40
41
42
class Cleaner2021(BaseCleaner):
    """Cleaner for 2021 dataset"""

    # CitizenId values this cleaner treats as invalid and filters out.
    _INVALID_CITIZEN_IDS = ("0000000000", "0", "#VALUE!", "681")

    @classmethod
    def _drop_invalid_citizen_ids(cls, df: pd.DataFrame) -> pd.DataFrame:
        """Drops rows whose CitizenId is missing or listed as invalid."""
        citizen_ids = df['CitizenId']
        keep = citizen_ids.notna() & ~citizen_ids.isin(cls._INVALID_CITIZEN_IDS)
        return df[keep]

    @staticmethod
    def _keep_known_citizens(df: pd.DataFrame, patient_data: pd.DataFrame,
                             date_column: str) -> pd.DataFrame:
        """Keeps rows of citizens found in patient_data, sorted by id and date."""
        known_ids = patient_data['CitizenId'].unique()
        df = df[df['CitizenId'].isin(known_ids)]
        return df.sort_values(['CitizenId', date_column])

    def clean_patient_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Cleans the patient data set.

        :param df: patient data with a 'CitizenId' column
        :return: the rows whose CitizenId is present and not listed
                 in _INVALID_CITIZEN_IDS
        """
        return self._drop_invalid_citizen_ids(df)

    def clean_screening_content(self, df: pd.DataFrame,
                                patient_data: pd.DataFrame) -> pd.DataFrame:
        """
        Cleans the screening content data set.

        :param df: screening content with 'CitizenId' and 'ScreeningDate'
        :param patient_data: patient data whose 'CitizenId' values are kept
        :return: matching rows sorted by CitizenId, then ScreeningDate
        """
        return self._keep_known_citizens(df, patient_data, 'ScreeningDate')

    def clean_status_set(self, df: pd.DataFrame,
                         patient_data: pd.DataFrame) -> pd.DataFrame:
        """
        Cleans the status set data set.

        :param df: status set with 'CitizenId' and 'ChangeDate'
        :param patient_data: patient data whose 'CitizenId' values are kept
        :return: matching rows sorted by CitizenId, then ChangeDate
        """
        return self._keep_known_citizens(df, patient_data, 'ChangeDate')

    def clean_training_done(self, df: pd.DataFrame,
                            patient_data: pd.DataFrame) -> pd.DataFrame:
        """
        Cleans the training done data set.

        :param df: training done with 'CitizenId' and 'RatingDate'
        :param patient_data: patient data whose 'CitizenId' values are kept
        :return: matching rows sorted by CitizenId, then RatingDate
        """
        return self._keep_known_citizens(df, patient_data, 'RatingDate')

    def clean_training_cancelled(self, df: pd.DataFrame,
                                 patient_data: pd.DataFrame) -> pd.DataFrame:
        """
        Cleans the training cancelled data set.

        :param df: training cancelled with 'CitizenId' and 'RatingDate'
        :param patient_data: patient data whose 'CitizenId' values are kept
        :return: matching rows sorted by CitizenId, then RatingDate
        """
        return self._keep_known_citizens(df, patient_data, 'RatingDate')

    def clean_assistive_aids(self, df: pd.DataFrame,
                             iso_classes: pd.DataFrame) -> pd.DataFrame:
        """
        Cleans the assistive aids data set.

        Drops rows with an invalid CitizenId, converts 'LendDate' to
        datetime, keeps lend dates between 1900-01-01 and today, keeps
        rows whose 'DevISOClass' occurs in iso_classes, and sorts the
        result by CitizenId, then LendDate.

        :param df: assistive aids with 'CitizenId', 'LendDate', 'DevISOClass'
        :param iso_classes: frame with a 'DevISOClass' column of allowed classes
        :return: the cleaned and sorted data set
        """
        # Copy before assigning a column: the filtered frame is a slice of
        # the caller's data, and writing to a slice raises pandas'
        # chained-assignment warning.
        df = self._drop_invalid_citizen_ids(df).copy()
        df['LendDate'] = pd.to_datetime(df['LendDate'])

        # Rows whose LendDate is NaT fail both comparisons and are dropped.
        in_range = ((df['LendDate'] >= '1900-01-01')
                    & (df['LendDate'] <= pd.Timestamp('today')))
        df = df[in_range]
        df = df[df['DevISOClass'].isin(iso_classes.DevISOClass)]

        # Sort only after the datetime conversion so that dates given as
        # strings are ordered chronologically rather than lexicographically.
        return df.sort_values(['CitizenId', 'LendDate'])