Source code for tools.inputter

"""
inputter.py
====================================
Module for creating features based on screenings.
"""

from typing import Tuple
import pandas as pd
import numpy as np
from pandas._libs.tslibs import Timestamp
from utility.data import Data

def get_ats(df: pd.DataFrame, end_date: Timestamp, settings: dict) -> str:
    """
    Build a comma-separated string of a citizen's ats up to a screening end date.

    :param df: a dataframe with ``DevISOClass`` and ``LendDate`` columns
    :param end_date: the screening end date; rows lent after it are ignored
    :param settings: the settings to use; ``ats_iso_length`` is the number of
        leading characters kept from each ISO class
    :return: the truncated ISO classes joined by ',', or '0' if there are none
    """
    lent_in_time = df.DevISOClass[df.LendDate <= end_date]
    truncated = (
        str(iso_class)[:settings['ats_iso_length']] for iso_class in lent_in_time
    )
    # An empty join yields '', which is replaced by the '0' placeholder.
    return ','.join(truncated) or '0'
def get_number_ats(df: pd.DataFrame, end_date: Timestamp) -> int:
    """
    Count the ats a citizen has up to a screening end date.

    :param df: a dataframe with ``CitizenId``, ``DevISOClass`` and ``LendDate``
        columns
    :param end_date: the screening end date; rows lent after it are ignored
    :return: the number of ats of the first citizen group, or 0 if there are none
    """
    lent_in_time = df.where(df.LendDate <= end_date)
    ats_per_citizen = lent_in_time.groupby(['CitizenId'])['DevISOClass'].count()
    if ats_per_citizen.any():
        # The counts are indexed by CitizenId, so the first entry has to be
        # taken by position; ``[0]`` would be looked up as a citizen id label.
        return int(ats_per_citizen.iloc[0])
    return 0
def get_avg_loan_period(df: pd.DataFrame, end_date: Timestamp) -> int:
    """
    Compute the average ats loan period in whole days.

    :param df: a dataframe with ``CitizenId``, ``DevISOClass`` and ``LendDate``
        columns
    :param end_date: the screening end date; rows lent after it are ignored
    :return: the mean distance in days between the lend dates and the end
        date, rounded down, or 0 if there are no ats
    """
    lent_in_time = df.where(df.LendDate <= end_date)
    ats_per_citizen = lent_in_time.groupby(['CitizenId'])['DevISOClass'].count()
    if not ats_per_citizen.any():
        return 0

    # Lend dates lie before the end date, so the mean offset is negative.
    mean_offset = (lent_in_time['LendDate'] - end_date).mean()
    seconds_per_day = 24 * 3600
    return int(abs(mean_offset.total_seconds()) // seconds_per_day)
def get_number_exercises(sc: Tuple) -> int:
    """
    Count the exercises in a screening's program.

    :param sc: a tuple with a screening; ``ExerciseContent`` is read as a
        comma-separated string
    :return: the number of entries that are not the '0' placeholder
    """
    exercises = sc.ExerciseContent.split(',')
    return sum(1 for exercise in exercises if exercise != '0')
def get_birth_year(sc: pd.DataFrame) -> int:
    """
    Read a citizen's birth year from the first screening row.

    :param sc: a dataframe with a ``BirthYear`` column
    :return: the birth year of the first row as an int
    """
    first_birth_year = sc['BirthYear'].iloc[0]
    return int(first_birth_year)
def get_gender(sc: pd.DataFrame) -> int:
    """
    Encode a citizen's gender from the first screening row.

    :param sc: a dataframe with a ``Gender`` column
    :return: 0 if the first row's gender is 'FEMALE', otherwise 1
    """
    first_gender = sc['Gender'].iloc[0]
    return 0 if first_gender == 'FEMALE' else 1
def get_citizen_data(data: Data, citizen_id: str) -> Data:
    """
    Extract all screening data held on a single citizen.

    :param data: a data DTO consisting of screening data of all citizens
    :param citizen_id: id of the citizen; compared as a string against the
        ``CitizenId`` column of every frame
    :return: a data DTO with only the citizen's data in it
    """
    wanted_id = str(citizen_id)

    def _rows_of_citizen(frame: pd.DataFrame) -> pd.DataFrame:
        # Keep only the rows that belong to the requested citizen.
        return frame.loc[frame['CitizenId'] == wanted_id]

    return Data(
        _rows_of_citizen(data.sc),
        _rows_of_citizen(data.ss),
        _rows_of_citizen(data.td),
        _rows_of_citizen(data.tc),
        _rows_of_citizen(data.ats),
    )
def get_screening_data(td: pd.DataFrame,
                       tc: pd.DataFrame,
                       ss: pd.DataFrame,
                       start_date: Timestamp,
                       end_date: Timestamp) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Window the training, cancellation and status data to a screening period.

    :param td: training data, filtered on ``RatingDate``
    :param tc: training cancellations, filtered on ``RatingDate``
    :param ss: status data, filtered on ``ChangeDate``
    :param start_date: start date of the screening (inclusive)
    :param end_date: end date of the screening (inclusive)
    :return: tuple with the windowed training data, training cancellations
        and status data
    """
    def _window(frame: pd.DataFrame, date_column: str) -> pd.DataFrame:
        # Both bounds are inclusive.
        dates = frame[date_column]
        return frame.loc[(dates <= end_date) & (dates >= start_date)]

    trainings_in_window = _window(td, 'RatingDate')
    cancels_in_window = _window(tc, 'RatingDate')
    status_in_window = _window(ss, 'ChangeDate')
    return trainings_in_window, cancels_in_window, status_in_window
def convert_date_to_datetime(date: Timestamp, date_format: str) -> Timestamp:
    """
    Convert a date to a datetime using an explicit format.

    :param date: date to convert
    :param date_format: date format passed on to ``pd.to_datetime``
    :return: the converted date
    """
    converted = pd.to_datetime(date, format=date_format)
    return converted
def get_cancels_week(tcw: pd.DataFrame) -> pd.Series:
    """
    Label every cancellation with the ISO week it falls in.

    The label is ``"<iso week>/<iso year>"``. The ISO year is used rather than
    the calendar year because the week number is an ISO week: a date such as
    2019-12-30 belongs to week 1 of 2020 and must not be merged with the first
    week of January 2019.

    :param tcw: windowed training cancellations with a ``RatingDate`` column
    :return: a series with one week label per cancellation
    """
    def _week_label(stamp) -> str:
        if pd.isnull(stamp):
            # Missing dates keep the label they have always produced.
            return "nan/nan"
        iso_year, iso_week, _ = stamp.isocalendar()
        return f"{iso_week}/{iso_year}"

    return tcw['RatingDate'].apply(_week_label)
def get_interval_length(start_date: Timestamp, end_date: Timestamp) -> float:
    """
    Compute the length of a screening interval in weeks.

    :param start_date: the start date
    :param end_date: the end date
    :return: the number of whole days between the dates divided by 7,
        rounded to two decimals
    """
    elapsed = end_date - start_date
    weeks = elapsed.days / 7
    return np.around(weeks, decimals=2)
def get_needs(sc: Tuple) -> int:
    """
    Read a screening's need for help score.

    :param sc: a tuple with a screening
    :return: the value of the screening's ``NeedForHelpScore`` field
    """
    needs_score = sc.NeedForHelpScore
    return needs_score
def get_physics(sc: Tuple) -> int:
    """
    Read a screening's physical strength score.

    :param sc: a tuple with a screening
    :return: the value of the screening's ``PhysicalStrengthScore`` field
    """
    physics_score = sc.PhysicalStrengthScore
    return physics_score
def get_needs_reason(sc: Tuple) -> str:
    """
    Read a screening's need for help reason.

    :param sc: a tuple with a screening
    :return: the value of ``NeedForHelpReason``, or 'Ingen forklaring' when
        the reason is missing (``None``, NaN or the string 'nan')
    """
    needs_reason = sc.NeedForHelpReason
    # A real null value is checked first: it is just as much "no reason" as
    # the stringified 'nan' and must not be handed to the caller as text.
    if pd.isnull(needs_reason) or needs_reason == 'nan':
        return 'Ingen forklaring'
    return needs_reason
def get_physics_reason(sc: Tuple) -> str:
    """
    Read a screening's physical strength reason.

    :param sc: a tuple with a screening
    :return: the value of ``PhysicalStrengthReason``, or 'Ingen forklaring'
        when the reason is missing (``None``, NaN or the string 'nan')
    """
    physics_reason = sc.PhysicalStrengthReason
    # A real null value is checked first: it is just as much "no reason" as
    # the stringified 'nan' and must not be handed to the caller as text.
    if pd.isnull(physics_reason) or physics_reason == 'nan':
        return 'Ingen forklaring'
    return physics_reason
def get_exercise_content(sc: Tuple) -> str:
    """
    Read the exercise content of a screening.

    :param sc: a tuple with a screening
    :return: the value of the screening's ``ExerciseContent`` field
    """
    exercise_content = sc.ExerciseContent
    return exercise_content
def get_n_cancel_week_min(cancelsprweek: pd.Series) -> int:
    """
    Find the smallest number of cancellations in any week that has cancellations.

    :param cancelsprweek: a series with one week label per cancellation
    :return: the lowest count among the week labels, or 0 for an empty series
    """
    if cancelsprweek.empty:
        return 0
    cancels_by_week = cancelsprweek.value_counts()
    return cancels_by_week.min()
def get_n_weeks_without_training(n_weeks: float, n_weeks_with_trainings: int) -> int:
    """
    Compute a citizen's number of weeks without training.

    :param n_weeks: number of screening weeks; rounded up to whole weeks
    :param n_weeks_with_trainings: number of training weeks
    :return: the rounded-up number of weeks minus the training weeks,
        never less than 0
    """
    total_weeks = np.ceil(n_weeks)
    idle_weeks = total_weeks - n_weeks_with_trainings
    # Clamp at zero in case more training weeks than screening weeks are given.
    return max(0, idle_weeks)
def get_n_weeks_with_training(tdw: pd.DataFrame, start_date: Timestamp) -> int:
    """
    Count the distinct weeks in which a citizen has trained.

    :param tdw: the windowed training data with a ``RatingDate`` column
    :param start_date: the start date; weeks are counted in 7-day steps from it
    :return: number of weeks with training
    """
    def _week_index(rating_date):
        # Whole weeks elapsed between the start date and the training.
        return np.floor((rating_date - start_date).days / 7)

    week_indices = tdw['RatingDate'].apply(_week_index)
    return week_indices.nunique()
def get_training_week(tdw: pd.DataFrame, start_date: Timestamp) -> pd.Series:
    """
    Map every training a citizen has done to its week index.

    :param tdw: the windowed training data with a ``RatingDate`` column
    :param start_date: the start date; weeks are counted in 7-day steps from it
    :return: a series with one week index per training, or an empty float
        series when there is no training data
    """
    if tdw.empty:
        # Give the empty result an explicit dtype: a bare ``pd.Series([])``
        # has a version-dependent dtype, whereas float64 matches the week
        # indices produced for non-empty data.
        return pd.Series([], dtype='float64')

    def _week_index(rating_date):
        # Whole weeks elapsed between the start date and the training.
        return np.floor((rating_date - start_date).days / 7)

    return tdw['RatingDate'].apply(_week_index)
def get_n_training_week_max(training_pr_week: pd.Series) -> int:
    """
    Find the largest number of trainings a citizen has done in a single week.

    :param training_pr_week: a series with one week index per training
    :return: the highest count among the weeks, or 0 for an empty series
    """
    if training_pr_week.empty:
        return 0
    trainings_by_week = training_pr_week.value_counts()
    return trainings_by_week.max()
def get_n_training_week_min(training_pr_week: pd.Series,
                            n_weeks_with_trainings: int,
                            n_weeks: float) -> int:
    """
    Find the smallest number of trainings a citizen has done in a single week.

    :param training_pr_week: a series with one week index per training
    :param n_weeks_with_trainings: number of weeks with training
    :param n_weeks: number of screening weeks
    :return: the lowest count among the weeks when there are more training
        weeks than screening weeks, otherwise 0
    """
    if training_pr_week.empty or not (n_weeks_with_trainings > n_weeks):
        return 0
    trainings_by_week = training_pr_week.value_counts()
    return trainings_by_week.min()
def get_n_training_week(n_weeks: float, n_training_window: int) -> int:
    """
    Compute the number of completed trainings per week.

    :param n_weeks: number of screening weeks
    :param n_training_window: number of trainings in the screening window
    :return: trainings divided by weeks, rounded to one decimal; 0 when an
        argument is ``None``, the number of weeks is zero or the result is null
    """
    if n_training_window is None or n_weeks is None:
        return 0
    if not n_weeks:
        # Avoid dividing by zero weeks.
        return 0
    trainings_per_week = np.around(float(n_training_window) / n_weeks, decimals=1)
    return 0 if pd.isnull(trainings_per_week) else trainings_per_week
def get_n_training_window(tdw: pd.DataFrame) -> int:
    """
    Count the trainings in the windowed training data.

    :param tdw: the windowed training data
    :return: the number of rows in the frame, or 0 when it is empty
    """
    if tdw.empty:
        return 0
    row_count, _ = tdw.shape
    return row_count
def get_max_evaluation(tdw: pd.DataFrame) -> int:
    """
    Find the largest evaluation score a citizen has gotten.

    :param tdw: the windowed training data with a ``RatingScore`` column
    :return: the highest score rounded to one decimal, or 0 when it is null
    """
    highest = tdw['RatingScore'].max(axis=0)
    rounded = np.around(highest, decimals=1)
    return 0 if pd.isnull(rounded) else rounded
def get_min_evaluation(tdw: pd.DataFrame) -> int:
    """
    Find the smallest evaluation score a citizen has gotten.

    :param tdw: the windowed training data with a ``RatingScore`` column
    :return: the lowest score rounded to one decimal, or 0 when it is null
    """
    lowest = tdw['RatingScore'].min(axis=0)
    rounded = np.around(lowest, decimals=1)
    return 0 if pd.isnull(rounded) else rounded
def get_std_evaluation(tdw: pd.DataFrame) -> float:
    """
    Compute the standard deviation of a citizen's evaluation scores.

    :param tdw: the windowed training data with a ``RatingScore`` column
    :return: the standard deviation rounded to one decimal, or 0.0 when it
        is null
    """
    spread = tdw['RatingScore'].std(axis=0)
    rounded = np.around(spread, decimals=1)
    return 0.0 if pd.isnull(rounded) else rounded
def get_mean_evaluation(tdw: pd.DataFrame) -> float:
    """
    Compute the mean of a citizen's evaluation scores.

    :param tdw: the windowed training data with a ``RatingScore`` column
    :return: the mean score rounded to one decimal, or 0.0 when it is null
    """
    average = tdw['RatingScore'].mean(axis=0)
    rounded = np.around(average, decimals=1)
    return 0.0 if pd.isnull(rounded) else rounded
def get_time_between_training_mean(tdw: pd.DataFrame, n_decimals: int = 2) -> float:
    """
    Compute the mean number of days between consecutive trainings.

    :param tdw: the windowed training data with a ``RatingDate`` column
    :param n_decimals: number of decimals for rounding
    :return: the mean gap in days between successive rows, or 0.0 when the
        data is empty or the mean is null
    """
    if tdw.empty:
        return 0.0

    gaps_in_days = tdw['RatingDate'].diff().apply(lambda gap: gap.days)
    if gaps_in_days.empty:
        return 0.0

    # The first row has no predecessor, so its gap is skipped.
    mean_gap = np.round(gaps_in_days.iloc[1:].mean(), n_decimals)
    return 0.0 if pd.isnull(mean_gap) else mean_gap
def get_mean_time_between_cancels(tcw: pd.DataFrame, n_decimals=2) -> float:
    """
    Compute the mean number of days between consecutive cancellations.

    :param tcw: the windowed cancellation data with a ``RatingDate`` column
    :param n_decimals: number of decimals for rounding
    :return: the mean gap in days between successive rows, or 0.0 when there
        are fewer than two cancellations
    """
    if tcw.empty:
        return 0.0

    gaps_in_days = tcw['RatingDate'].diff().apply(lambda gap: gap.days)
    if not tcw.shape[0] > 1:
        # A single cancellation has no gap to measure.
        return 0.0

    # The first row has no predecessor, so its gap is skipped.
    return np.round(gaps_in_days.iloc[1:].mean(), n_decimals)
def get_mean_cancels_week(n_cancel: int, n_weeks: float) -> int:
    """
    Compute the mean number of cancellations per week.

    :param n_cancel: number of cancellations
    :param n_weeks: number of screening weeks
    :return: cancellations divided by weeks, rounded to two decimals; 0 when
        an argument is ``None`` or the number of weeks is zero
    """
    if n_cancel is None or n_weeks is None:
        return 0
    if not n_weeks:
        # Avoid dividing by zero weeks.
        return 0
    return round(float(n_cancel) / n_weeks, 2)