Source code for tools.cleaner

"""
cleaner.py
====================================
Module to clean raw data.
"""

from abc import ABC, abstractmethod
from typing import List
import pandas as pd
import numpy as np

class BaseCleaner(ABC):
    """Base class for cleaners.

    Declares one abstract cleaning hook per raw data set. A concrete
    cleaner has to override every hook before it can be instantiated.
    """

    @abstractmethod
    def clean_patient_data(self, patient_data: pd.DataFrame) -> pd.DataFrame:
        """Cleans the patient data set.

        :param patient_data: the raw patient data set
        :return: the cleaned patient data set
        """

    @abstractmethod
    def clean_screening_content(self, screening_content: pd.DataFrame,
                                patient_data: pd.DataFrame) -> pd.DataFrame:
        """Cleans the screening content data set.

        :param screening_content: the raw screening content data set
        :param patient_data: the patient data set
        :return: the cleaned screening content data set
        """

    @abstractmethod
    def clean_status_set(self, status_set: pd.DataFrame,
                         patient_data: pd.DataFrame) -> pd.DataFrame:
        """Cleans the status set data set.

        :param status_set: the raw status set data set
        :param patient_data: the patient data set
        :return: the cleaned status set data set
        """

    @abstractmethod
    def clean_training_done(self, training_done: pd.DataFrame,
                            patient_data: pd.DataFrame) -> pd.DataFrame:
        """Cleans the training done data set.

        :param training_done: the raw training done data set
        :param patient_data: the patient data set
        :return: the cleaned training done data set
        """

    @abstractmethod
    def clean_training_cancelled(self, training_cancelled: pd.DataFrame,
                                 patient_data: pd.DataFrame) -> pd.DataFrame:
        """Cleans the training cancelled data set.

        :param training_cancelled: the raw training cancelled data set
        :param patient_data: the patient data set
        :return: the cleaned training cancelled data set
        """

    @abstractmethod
    def clean_assistive_aids(self, ats: pd.DataFrame,
                             iso_classes: pd.DataFrame) -> pd.DataFrame:
        """Cleans the assistive aids data set.

        :param ats: the raw assistive aids data set
        :param iso_classes: the ISO classes data set
        :return: the cleaned assistive aids data set
        """
class Cleaner2021(BaseCleaner):
    """Cleaner for 2021 dataset"""

    # CitizenId values whose rows are discarded. Presumably placeholder or
    # corrupt export values -- confirm with the data owner before extending.
    _INVALID_CITIZEN_IDS = ("0000000000", "0", "#VALUE!", "681")

    def _drop_invalid_citizen_ids(self, df: pd.DataFrame) -> pd.DataFrame:
        """Drop rows whose CitizenId is excluded or missing.

        :param df: data set with a ``CitizenId`` column
        :return: a new frame without the excluded / missing CitizenId rows
        """
        excluded = df['CitizenId'].isin(list(self._INVALID_CITIZEN_IDS))
        return df[~excluded].dropna(subset=['CitizenId'])

    def _keep_known_citizens(self, df: pd.DataFrame,
                             patient_data: pd.DataFrame,
                             date_column: str) -> pd.DataFrame:
        """Keep rows of citizens present in patient_data and sort them.

        :param df: data set with a ``CitizenId`` column and ``date_column``
        :param patient_data: patient data set providing the known CitizenIds
        :param date_column: name of the secondary sort column
        :return: the filtered frame sorted by CitizenId, then ``date_column``
        """
        known_ids = patient_data['CitizenId'].unique()
        kept = df[df['CitizenId'].isin(known_ids)]
        return kept.sort_values(['CitizenId', date_column])

    def clean_patient_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Remove patient rows with an excluded or missing CitizenId.

        :param df: the raw patient data set
        :return: the patient data set without those rows
        """
        return self._drop_invalid_citizen_ids(df)

    def clean_screening_content(self, df: pd.DataFrame,
                                patient_data: pd.DataFrame) -> pd.DataFrame:
        """Keep screenings of known citizens, sorted by CitizenId and ScreeningDate.

        :param df: the raw screening content data set
        :param patient_data: patient data set providing the known CitizenIds
        :return: the filtered and sorted screening content
        """
        return self._keep_known_citizens(df, patient_data, 'ScreeningDate')

    def clean_status_set(self, df: pd.DataFrame,
                         patient_data: pd.DataFrame) -> pd.DataFrame:
        """Keep status rows of known citizens, sorted by CitizenId and ChangeDate.

        :param df: the raw status set data set
        :param patient_data: patient data set providing the known CitizenIds
        :return: the filtered and sorted status set
        """
        return self._keep_known_citizens(df, patient_data, 'ChangeDate')

    def clean_training_done(self, df: pd.DataFrame,
                            patient_data: pd.DataFrame) -> pd.DataFrame:
        """Keep completed trainings of known citizens, sorted by CitizenId and RatingDate.

        :param df: the raw training done data set
        :param patient_data: patient data set providing the known CitizenIds
        :return: the filtered and sorted training done data set
        """
        return self._keep_known_citizens(df, patient_data, 'RatingDate')

    def clean_training_cancelled(self, df: pd.DataFrame,
                                 patient_data: pd.DataFrame) -> pd.DataFrame:
        """Keep cancelled trainings of known citizens, sorted by CitizenId and RatingDate.

        :param df: the raw training cancelled data set
        :param patient_data: patient data set providing the known CitizenIds
        :return: the filtered and sorted training cancelled data set
        """
        return self._keep_known_citizens(df, patient_data, 'RatingDate')

    def clean_assistive_aids(self, df: pd.DataFrame,
                             iso_classes: pd.DataFrame) -> pd.DataFrame:
        """Clean the assistive aids data set.

        Drops rows with an excluded or missing CitizenId, parses ``LendDate``
        to datetime, keeps lend dates between 1900-01-01 and today, keeps
        only devices whose ``DevISOClass`` occurs in ``iso_classes`` and
        sorts the result by CitizenId and LendDate.

        :param df: the raw assistive aids data set
        :param iso_classes: data set whose ``DevISOClass`` column lists the
            accepted ISO classes
        :return: the cleaned assistive aids data set
        """
        df = self._drop_invalid_citizen_ids(df)

        # assign() builds a new frame instead of writing into the result of a
        # boolean filter, which is what triggers SettingWithCopyWarning.
        df = df.assign(LendDate=pd.to_datetime(df['LendDate']))

        today = pd.Timestamp('today')
        in_range = (df['LendDate'] >= '1900-01-01') & (df['LendDate'] <= today)
        df = df.loc[in_range]
        df = df[df['DevISOClass'].isin(iso_classes.DevISOClass)]

        # Sort last, like the other cleaners, and on the parsed dates so the
        # order is chronological rather than that of the raw column values.
        return df.sort_values(['CitizenId', 'LendDate'])