Source code for tools.inputter
"""
inputter.py
====================================
Module for creating features based on screenings.
"""
from typing import Tuple
import pandas as pd
import numpy as np
from pandas._libs.tslibs import Timestamp
from utility.data import Data
def get_ats(df: pd.DataFrame, end_date: Timestamp, settings: dict) -> str:
    """
    Build a comma-separated string of the ats codes lent up to a screening.

    Rows whose ``LendDate`` is on or before ``end_date`` are used; each
    ``DevISOClass`` value is converted to a string and cut down to its first
    ``settings['ats_iso_length']`` characters.

    :param df: a dataframe with ``DevISOClass`` and ``LendDate`` columns
    :param end_date: the screening end date
    :param settings: the settings to use; must provide ``ats_iso_length``
    :return: the joined codes, or ``'0'`` when the joined string is empty
    """
    lent_in_time = df.LendDate <= end_date
    truncated_codes = (
        str(iso_class)[:settings['ats_iso_length']]
        for iso_class in df.DevISOClass[lent_in_time]
    )
    # An empty join result is falsy, so fall back to the '0' placeholder.
    return ','.join(truncated_codes) or '0'
def get_number_ats(df: pd.DataFrame, end_date: Timestamp) -> int:
    """
    Count the ats entries lent on or before the screening end date.

    The rows with ``LendDate <= end_date`` are grouped by ``CitizenId`` and
    the non-null ``DevISOClass`` values are counted per citizen. The count of
    the first group (in sorted ``CitizenId`` order) is returned.

    :param df: a dataframe with ``CitizenId``, ``DevISOClass`` and ``LendDate``
    :param end_date: the screening end date
    :return: the number of ats, or 0 when no group has a non-zero count
    """
    lent_in_time = df.loc[df.LendDate <= end_date]
    counts = lent_in_time.groupby(['CitizenId'])['DevISOClass'].count()
    if counts.any():
        # The result is indexed by CitizenId, so an integer key such as
        # ``counts[0]`` is a label lookup (KeyError for numeric ids) or a
        # deprecated positional fallback; ``iloc`` is explicitly positional.
        return int(counts.iloc[0])
    return 0
def get_avg_loan_period(df: pd.DataFrame, end_date: Timestamp) -> int:
    """
    Compute the average ats loan period in whole days.

    Rows lent after ``end_date`` are masked out. When at least one citizen
    has a counted ``DevISOClass`` left, the mean distance between ``LendDate``
    and ``end_date`` is floored to whole days.

    :param df: a dataframe with ``CitizenId``, ``DevISOClass`` and ``LendDate``
    :param end_date: the screening end date
    :return: the average loan period in days, or 0 when nothing is counted
    """
    windowed = df.where(df.LendDate <= end_date)
    per_citizen = windowed.groupby(['CitizenId'])['DevISOClass'].count()
    if not per_citizen.any():
        return 0

    mean_offset = (windowed['LendDate'] - end_date).mean()
    seconds_per_day = 24 * 3600
    whole_days = abs(mean_offset.total_seconds()) // seconds_per_day
    return int(whole_days)
def get_number_exercises(sc: Tuple) -> int:
    """
    Count the exercises listed in a screening's program.

    ``ExerciseContent`` is split on commas and every entry other than the
    string ``'0'`` is counted.

    :param sc: a tuple with a screening, exposing ``ExerciseContent``
    :return: the number of exercises
    """
    entries = sc.ExerciseContent.split(',')
    return sum(1 for entry in entries if entry != '0')
def get_birth_year(sc: pd.DataFrame) -> int:
    """
    Read a citizen's birth year from the first screening row.

    :param sc: a dataframe of screenings with a ``BirthYear`` column
    :return: the first row's birth year converted to ``int``
    """
    first_birth_year = sc['BirthYear'].iloc[0]
    return int(first_birth_year)
def get_gender(sc: pd.DataFrame) -> int:
    """
    Encode a citizen's gender from the first screening row.

    :param sc: a dataframe of screenings with a ``Gender`` column
    :return: 0 when the first row's gender is ``'FEMALE'``, otherwise 1
    """
    is_female = sc['Gender'].iloc[0] == 'FEMALE'
    return 0 if is_female else 1
def get_citizen_data(data: Data, citizen_id: str) -> Data:
    """
    Extract all the data held on a single citizen.

    Each frame of ``data`` (``sc``, ``ss``, ``td``, ``tc`` and ``ats``) is
    filtered to the rows whose ``CitizenId`` equals ``str(citizen_id)``.

    :param data: a data DTO consisting of screening data of all citizens
    :param citizen_id: id of the citizen
    :return: a data DTO with only the citizen's data in it
    """
    wanted_id = str(citizen_id)

    def _rows_for_citizen(frame: pd.DataFrame) -> pd.DataFrame:
        # Keep only the rows belonging to the requested citizen.
        return frame.loc[frame['CitizenId'] == wanted_id]

    return Data(
        _rows_for_citizen(data.sc),
        _rows_for_citizen(data.ss),
        _rows_for_citizen(data.td),
        _rows_for_citizen(data.tc),
        _rows_for_citizen(data.ats),
    )
def get_screening_data(td: pd.DataFrame, tc: pd.DataFrame,
                       ss: pd.DataFrame, start_date: Timestamp,
                       end_date: Timestamp) -> Tuple[pd.DataFrame,
                                                     pd.DataFrame,
                                                     pd.DataFrame]:
    """
    Restrict screening data to a date window (both ends inclusive).

    ``td`` and ``tc`` are filtered on ``RatingDate``; ``ss`` is filtered on
    ``ChangeDate``.

    :param td: training data
    :param tc: training cancellations
    :param ss: status data
    :param start_date: start date of the screening
    :param end_date: end date of the screening
    :return: tuple with the windowed training data, training cancellations
        and status data
    """
    def _within_window(frame: pd.DataFrame, date_column: str) -> pd.DataFrame:
        # Rows whose date lies in [start_date, end_date].
        dates = frame[date_column]
        return frame.loc[(dates <= end_date) & (dates >= start_date)]

    training_window = _within_window(td, 'RatingDate')
    cancel_window = _within_window(tc, 'RatingDate')
    status_window = _within_window(ss, 'ChangeDate')
    return training_window, cancel_window, status_window
def convert_date_to_datetime(date: Timestamp,
                             date_format: str) -> Timestamp:
    """
    Convert a date to a datetime using an explicit format.

    :param date: date to convert
    :param date_format: date format passed to ``pd.to_datetime``
    :return: the converted date
    """
    converted = pd.to_datetime(date, format=date_format)
    return converted
def get_cancels_week(tcw: pd.DataFrame) -> pd.Series:
    """
    Label every cancellation with the week it happened in.

    Each ``RatingDate`` is turned into a ``"<week>/<year>"`` string.

    :param tcw: windowed training cancellations
    :return: a series with one week label per cancellation
    """
    def _week_label(rating_date) -> str:
        # NOTE(review): ``week`` and ``year`` are read independently from the
        # date; presumably ``week`` is the ISO week number, so dates around
        # New Year may pair an ISO week with a different calendar year.
        return f"{rating_date.week}/{rating_date.year}"

    return tcw['RatingDate'].apply(_week_label)
def get_interval_length(start_date: Timestamp,
                        end_date: Timestamp) -> float:
    """
    Compute the length of a screening interval in weeks.

    The whole days between the two dates are divided by 7 and rounded to
    two decimals.

    :param start_date: the start date
    :param end_date: the end date
    :return: the length of the interval in weeks
    """
    elapsed_days = (end_date - start_date).days
    return np.around(elapsed_days / 7, decimals=2)
def get_needs(sc: Tuple) -> int:
    """
    Read a screening's need for help score.

    :param sc: a tuple with a screening, exposing ``NeedForHelpScore``
    :return: the need for help score
    """
    need_for_help_score = sc.NeedForHelpScore
    return need_for_help_score
def get_physics(sc: Tuple) -> int:
    """
    Read a screening's physical strength score.

    :param sc: a tuple with a screening, exposing ``PhysicalStrengthScore``
    :return: the physical strength score
    """
    physical_strength_score = sc.PhysicalStrengthScore
    return physical_strength_score
def get_needs_reason(sc: Tuple) -> str:
    """
    Read a screening's need for help reason.

    :param sc: a tuple with a screening, exposing ``NeedForHelpReason``
    :return: the need for help reason, or ``'Ingen forklaring'`` when the
        stored reason is the string ``'nan'``
    """
    reason = sc.NeedForHelpReason
    if reason == 'nan':
        # A stringified missing value gets the placeholder text instead.
        return 'Ingen forklaring'
    return reason
def get_physics_reason(sc: Tuple) -> str:
    """
    Read a screening's physical strength reason.

    :param sc: a tuple with a screening, exposing ``PhysicalStrengthReason``
    :return: the physical strength reason, or ``'Ingen forklaring'`` when
        the stored reason is the string ``'nan'``
    """
    reason = sc.PhysicalStrengthReason
    if reason == 'nan':
        # A stringified missing value gets the placeholder text instead.
        return 'Ingen forklaring'
    return reason
def get_exercise_content(sc: Tuple) -> str:
    """
    Read the exercise content of a screening.

    :param sc: a tuple with a screening, exposing ``ExerciseContent``
    :return: the exercise content
    """
    exercise_content = sc.ExerciseContent
    return exercise_content
def get_n_cancel_week_min(cancelsprweek: pd.Series) -> int:
    """
    Find the smallest number of cancellations recorded in any single week.

    Weeks are the distinct values of ``cancelsprweek``; the lowest
    occurrence count among them is returned.

    :param cancelsprweek: a series with one week label per cancellation
    :return: the minimum count per week, or 0 for an empty series
    """
    if cancelsprweek.empty:
        return 0
    return cancelsprweek.value_counts().min()
def get_n_weeks_without_training(n_weeks: float,
                                 n_weeks_with_trainings: int) -> int:
    """
    Compute the number of weeks in which no training took place.

    The screening length is rounded up to whole weeks before the weeks
    with training are subtracted; the result never goes below zero.

    :param n_weeks: number of screening weeks
    :param n_weeks_with_trainings: number of training weeks
    :return: number of weeks without training
    """
    weeks_without = np.ceil(n_weeks) - n_weeks_with_trainings
    return weeks_without if weeks_without > 0 else 0
def get_n_weeks_with_training(tdw: pd.DataFrame,
                              start_date: Timestamp) -> int:
    """
    Count the distinct weeks in which a citizen trained.

    Every ``RatingDate`` is mapped to the number of whole weeks since
    ``start_date``; the number of distinct week indices is returned.

    :param tdw: the windowed training data
    :param start_date: the start date of the screening
    :return: number of weeks with training
    """
    def _week_index(rating_date):
        # Whole weeks elapsed between the screening start and the training.
        return np.floor((rating_date - start_date).days / 7)

    week_indices = tdw['RatingDate'].apply(_week_index)
    return week_indices.nunique()
def get_training_week(tdw: pd.DataFrame,
                      start_date: Timestamp) -> pd.Series:
    """
    Map every training to the screening week it was done in.

    Each ``RatingDate`` becomes the number of whole weeks elapsed since
    ``start_date`` (a float produced by ``np.floor``).

    :param tdw: the windowed training data
    :param start_date: the start date
    :return: a series of week indices, one per training; an empty
        ``float64`` series when there is no training data
    """
    if tdw.empty:
        # Give the empty result an explicit dtype: a bare ``pd.Series([])``
        # has a version-dependent default dtype, while the non-empty branch
        # yields float week indices.
        return pd.Series([], dtype='float64')
    return tdw['RatingDate'].apply(
        lambda rating_date: np.floor((rating_date - start_date).days / 7))
def get_n_training_week_max(training_pr_week: pd.Series) -> int:
    """
    Find the largest number of trainings done in any single week.

    :param training_pr_week: a series with one week index per training
    :return: the highest occurrence count of a week, or 0 for an empty
        series
    """
    if training_pr_week.empty:
        return 0
    return training_pr_week.value_counts().max()
def get_n_training_week_min(training_pr_week: pd.Series,
                            n_weeks_with_trainings: int,
                            n_weeks: float) -> int:
    """
    Find the smallest number of trainings done in any single week.

    A non-zero minimum is only reported when the number of weeks with
    training exceeds the number of screening weeks; otherwise the result
    is 0.

    :param training_pr_week: a series with one week index per training
    :param n_weeks_with_trainings: number of weeks with training
    :param n_weeks: number of screening weeks
    :return: smallest number of trainings done per week
    """
    # The emptiness check comes first so the comparison is skipped for an
    # empty series.
    if training_pr_week.empty or not (n_weeks_with_trainings > n_weeks):
        return 0
    return training_pr_week.value_counts().min()
def get_n_training_week(n_weeks: float,
                        n_training_window: int) -> int:
    """
    Compute the number of completed trainings per week.

    The training count is divided by the number of weeks and rounded to
    one decimal.

    :param n_weeks: number of screening weeks
    :param n_training_window: number of trainings in the window
    :return: trainings per week; 0 when an argument is ``None``, when
        ``n_weeks`` is falsy, or when the ratio is null
    """
    if n_training_window is None or n_weeks is None:
        return 0
    if not n_weeks:
        # Avoid dividing by a zero-length interval.
        return 0

    per_week = np.around(float(n_training_window) / n_weeks, decimals=1)
    return 0 if pd.isnull(per_week) else per_week
def get_n_training_window(tdw: pd.DataFrame) -> int:
    """
    Count the trainings inside the window.

    :param tdw: the windowed training data
    :return: the number of rows in ``tdw``, or 0 when the frame is empty
    """
    if tdw.empty:
        return 0
    row_count = tdw.shape[0]
    return row_count
def get_max_evaluation(tdw: pd.DataFrame) -> int:
    """
    Find the largest evaluation score a citizen has gotten.

    :param tdw: the windowed training data with a ``RatingScore`` column
    :return: the largest score rounded to one decimal, or 0 when it is null
    """
    highest = np.around(tdw['RatingScore'].max(axis=0), decimals=1)
    return 0 if pd.isnull(highest) else highest
def get_min_evaluation(tdw: pd.DataFrame) -> int:
    """
    Find the smallest evaluation score a citizen has gotten.

    :param tdw: the windowed training data with a ``RatingScore`` column
    :return: the smallest score rounded to one decimal, or 0 when it is null
    """
    lowest = np.around(tdw['RatingScore'].min(axis=0), decimals=1)
    return 0 if pd.isnull(lowest) else lowest
def get_std_evaluation(tdw: pd.DataFrame) -> float:
    """
    Compute the standard deviation of a citizen's evaluation scores.

    :param tdw: the windowed training data with a ``RatingScore`` column
    :return: the standard deviation rounded to one decimal, or 0.0 when it
        is null
    """
    spread = np.around(tdw['RatingScore'].std(axis=0), decimals=1)
    return 0.0 if pd.isnull(spread) else spread
def get_mean_evaluation(tdw: pd.DataFrame) -> float:
    """
    Compute the mean of a citizen's evaluation scores.

    :param tdw: the windowed training data with a ``RatingScore`` column
    :return: the mean score rounded to one decimal, or 0.0 when it is null
    """
    average = np.around(tdw['RatingScore'].mean(axis=0), decimals=1)
    return 0.0 if pd.isnull(average) else average
def get_time_between_training_mean(tdw: pd.DataFrame,
                                   n_decimals:int=2) -> float:
    """
    Compute the mean number of days between consecutive trainings.

    The gaps are the row-to-row differences of ``RatingDate`` in whole
    days; the first row has no predecessor and is left out of the mean.

    :param tdw: the windowed training data
    :param n_decimals: number of decimals for rounding
    :return: the mean gap in days, or 0.0 when there is no training data
        or the mean is null
    """
    if tdw.empty:
        return 0.0

    gaps_in_days = tdw['RatingDate'].diff().apply(lambda gap: gap.days)
    if gaps_in_days.empty:
        return 0.0

    mean_gap = np.round(gaps_in_days.iloc[1:].mean(), n_decimals)
    return 0.0 if pd.isnull(mean_gap) else mean_gap
def get_mean_time_between_cancels(tcw: pd.DataFrame, n_decimals=2) -> float:
    """
    Compute the mean number of days between consecutive cancellations.

    The gaps are the row-to-row differences of ``RatingDate`` in whole
    days; the first row has no predecessor and is left out of the mean.

    :param tcw: the windowed cancellation data
    :param n_decimals: number of decimals for rounding
    :return: the mean gap in days, or 0.0 when there are fewer than two
        cancellations
    """
    if tcw.empty:
        return 0.0

    gaps_in_days = tcw['RatingDate'].diff().apply(lambda gap: gap.days)
    if tcw.shape[0] <= 1:
        # A single cancellation has no gap to average.
        return 0.0
    return np.round(gaps_in_days.iloc[1:].mean(), n_decimals)
def get_mean_cancels_week(n_cancel: int, n_weeks: float) -> int:
    """
    Compute the mean number of cancellations per week.

    :param n_cancel: number of cancellations
    :param n_weeks: number of screening weeks
    :return: cancellations per week rounded to two decimals; 0 when an
        argument is ``None`` or ``n_weeks`` is falsy
    """
    if n_cancel is None or n_weeks is None:
        return 0
    if not n_weeks:
        # Avoid dividing by a zero-length interval.
        return 0
    return round(float(n_cancel) / n_weeks, 2)