# Source code for tools.cleaner
"""
cleaner.py
====================================
Module to clean raw data.
"""
from abc import ABC, abstractmethod
from typing import List
import pandas as pd
import numpy as np
class BaseCleaner(ABC):
    """
    Base class for cleaners.

    Declares one cleaning hook per raw data set. Every hook is abstract, so
    a subclass must override all six before it can be instantiated. The
    hooks themselves contain no logic; what "clean" means is defined
    entirely by the concrete subclass.

    The ``pd.DataFrame`` annotations mirror the signatures used by the
    concrete cleaner in this module; they are hints only and are not
    enforced at runtime.
    """

    @abstractmethod
    def clean_patient_data(self, patient_data: pd.DataFrame) -> pd.DataFrame:
        """Cleans the patient data set.

        :param patient_data: raw patient data.
        :return: the cleaned patient data.
        """

    @abstractmethod
    def clean_screening_content(self, screening_content: pd.DataFrame,
                                patient_data: pd.DataFrame) -> pd.DataFrame:
        """Cleans the screening content data set.

        :param screening_content: raw screening content data.
        :param patient_data: patient data made available to the cleaning step.
        :return: the cleaned screening content data.
        """

    @abstractmethod
    def clean_status_set(self, status_set: pd.DataFrame,
                         patient_data: pd.DataFrame) -> pd.DataFrame:
        """Cleans the status set data set.

        :param status_set: raw status set data.
        :param patient_data: patient data made available to the cleaning step.
        :return: the cleaned status set data.
        """

    @abstractmethod
    def clean_training_done(self, training_done: pd.DataFrame,
                            patient_data: pd.DataFrame) -> pd.DataFrame:
        """Cleans the training done data set.

        :param training_done: raw training done data.
        :param patient_data: patient data made available to the cleaning step.
        :return: the cleaned training done data.
        """

    @abstractmethod
    def clean_training_cancelled(self, training_cancelled: pd.DataFrame,
                                 patient_data: pd.DataFrame) -> pd.DataFrame:
        """Cleans the training cancelled data set.

        :param training_cancelled: raw training cancelled data.
        :param patient_data: patient data made available to the cleaning step.
        :return: the cleaned training cancelled data.
        """

    @abstractmethod
    def clean_assistive_aids(self, ats: pd.DataFrame,
                             iso_classes: pd.DataFrame) -> pd.DataFrame:
        """Cleans the assistive aids data set.

        :param ats: raw assistive aids data.
        :param iso_classes: ISO class data made available to the cleaning step.
        :return: the cleaned assistive aids data.
        """
class Cleaner2021(BaseCleaner):
    """Cleaner for 2021 dataset.

    All methods return a new frame and leave their inputs untouched.
    """

    # CitizenId values that this cleaner treats as unusable; rows carrying
    # one of them (or a missing CitizenId) are dropped.
    _INVALID_CITIZEN_IDS = ("0000000000", "0", "#VALUE!", "681")

    @classmethod
    def _drop_invalid_citizen_ids(cls, df: pd.DataFrame) -> pd.DataFrame:
        """Return the rows of ``df`` whose CitizenId is present and valid."""
        citizen_ids = df['CitizenId']
        usable = citizen_ids.notna() & ~citizen_ids.isin(list(cls._INVALID_CITIZEN_IDS))
        return df[usable]

    @staticmethod
    def _restrict_to_patients(df: pd.DataFrame, patient_data: pd.DataFrame,
                              date_column: str) -> pd.DataFrame:
        """Keep rows whose CitizenId occurs in ``patient_data``, sorted by
        CitizenId and then ``date_column``."""
        known_ids = patient_data['CitizenId'].unique()
        kept = df[df['CitizenId'].isin(known_ids)]
        return kept.sort_values(['CitizenId', date_column])

    def clean_patient_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Drop patient rows with a missing or invalid CitizenId.

        :param df: raw patient data with a ``CitizenId`` column.
        :return: the rows of ``df`` that have a usable CitizenId.
        """
        return self._drop_invalid_citizen_ids(df)

    def clean_screening_content(self, df: pd.DataFrame,
                                patient_data: pd.DataFrame) -> pd.DataFrame:
        """Keep screenings of known patients, ordered by citizen and date.

        :param df: raw screening data with ``CitizenId`` and
            ``ScreeningDate`` columns.
        :param patient_data: patient data whose ``CitizenId`` values define
            which rows are kept.
        :return: filtered rows sorted by ``CitizenId``, ``ScreeningDate``.
        """
        return self._restrict_to_patients(df, patient_data, 'ScreeningDate')

    def clean_status_set(self, df: pd.DataFrame,
                         patient_data: pd.DataFrame) -> pd.DataFrame:
        """Keep status rows of known patients, ordered by citizen and date.

        :param df: raw status data with ``CitizenId`` and ``ChangeDate``
            columns.
        :param patient_data: patient data whose ``CitizenId`` values define
            which rows are kept.
        :return: filtered rows sorted by ``CitizenId``, ``ChangeDate``.
        """
        return self._restrict_to_patients(df, patient_data, 'ChangeDate')

    def clean_training_done(self, df: pd.DataFrame,
                            patient_data: pd.DataFrame) -> pd.DataFrame:
        """Keep completed trainings of known patients, ordered by citizen
        and date.

        :param df: raw training data with ``CitizenId`` and ``RatingDate``
            columns.
        :param patient_data: patient data whose ``CitizenId`` values define
            which rows are kept.
        :return: filtered rows sorted by ``CitizenId``, ``RatingDate``.
        """
        return self._restrict_to_patients(df, patient_data, 'RatingDate')

    def clean_training_cancelled(self, df: pd.DataFrame,
                                 patient_data: pd.DataFrame) -> pd.DataFrame:
        """Keep cancelled trainings of known patients, ordered by citizen
        and date.

        :param df: raw cancellation data with ``CitizenId`` and
            ``RatingDate`` columns.
        :param patient_data: patient data whose ``CitizenId`` values define
            which rows are kept.
        :return: filtered rows sorted by ``CitizenId``, ``RatingDate``.
        """
        return self._restrict_to_patients(df, patient_data, 'RatingDate')

    def clean_assistive_aids(self, df: pd.DataFrame,
                             iso_classes: pd.DataFrame) -> pd.DataFrame:
        """Clean the assistive aids data set.

        Drops rows with a missing or invalid CitizenId, parses ``LendDate``,
        keeps only lend dates between 1900-01-01 and now (inclusive) and
        only device classes listed in ``iso_classes``, then sorts by
        ``CitizenId`` and ``LendDate``.

        :param df: raw assistive aids data with ``CitizenId``, ``LendDate``
            and ``DevISOClass`` columns.
        :param iso_classes: frame whose ``DevISOClass`` column lists the
            accepted device classes.
        :return: the cleaned rows, with ``LendDate`` as datetimes.
        """
        df = self._drop_invalid_citizen_ids(df)
        # assign() builds a new frame, so the parsed column is never written
        # into a filtered slice (which would risk SettingWithCopyWarning).
        df = df.assign(LendDate=pd.to_datetime(df['LendDate']))
        # Rows whose date parsed to NaT fail both comparisons and are dropped.
        in_range = ((df['LendDate'] >= pd.Timestamp('1900-01-01'))
                    & (df['LendDate'] <= pd.Timestamp('today')))
        known_class = df['DevISOClass'].isin(iso_classes['DevISOClass'])
        df = df[in_range & known_class]
        # Sort only after parsing: sorting raw date strings would order them
        # lexicographically instead of chronologically.
        return df.sort_values(['CitizenId', 'LendDate'])