Skip to content
Snippets Groups Projects
Commit b12d63d8 authored by Christian Marius Lillelund's avatar Christian Marius Lillelund
Browse files

moved some code to notebook, renamed code in tools

parent f2b8c184
No related branches found
No related tags found
No related merge requests found
Pipeline #67138 passed
This diff is collapsed.
......@@ -37,13 +37,13 @@ class BaseCleaner(ABC):
def clean_fall_data(self, fd):
"""Cleans the fall set"""
def remove_citizens_not_in_patient_data(self, train_data: pd.DataFrame,
                                        patient_data: pd.DataFrame,
                                        id: str) -> pd.DataFrame:
    """Keep only rows of train_data whose id column value also occurs in patient_data."""
    known_ids = patient_data[id].unique()
    return train_data[train_data[id].isin(known_ids)]
def remove_citizens_without_valid_id(df: pd.DataFrame) -> pd.DataFrame:
def remove_citizens_without_valid_id(self, df: pd.DataFrame) -> pd.DataFrame:
df = df[df['CitizenId'] != "0000000000"]
df = df[df['CitizenId'] != '0']
df = df[df['CitizenId'] != "#VALUE!"]
......@@ -51,34 +51,34 @@ class BaseCleaner(ABC):
df = df.dropna(subset=['CitizenId'])
return df
def merge_train_and_patient_data(self, train_data: pd.DataFrame,
                                 patient_data: pd.DataFrame,
                                 key: str) -> pd.DataFrame:
    """Inner-join the training rows with the patient rows on `key`."""
    merged = pd.merge(train_data, patient_data, on=key)
    return merged
def sort_dataframe(self, data: pd.DataFrame, by: str) -> pd.DataFrame:
    """Return `data` sorted ascending by the given column name(s)."""
    ordered = data.sort_values(by=by)
    return ordered
def filter_ats_on_ids(self, ats: pd.DataFrame, ids: List[str]) -> pd.DataFrame:
    """Keep only assistive-technology rows that belong to the given citizens."""
    keep = ats['CitizenId'].isin(ids)
    return ats[keep]
def remove_tainted_histories(self, ats: pd.DataFrame) -> pd.DataFrame:
    """Drop all rows for citizens that have at least one '899,999' device
    record, since the whole device history of such citizens is suspect.

    :param ats: assistive-technology data with 'CitizenId' and 'DevHMINumber'
    :return: `ats` without the tainted citizens' rows
    """
    # Fixed misspelled local (`tained_ids`) and replaced np.logical_not
    # with the idiomatic pandas `~` negation.
    tainted_ids = ats[ats['DevHMINumber'] == '899,999']['CitizenId'].unique()
    return ats[~ats['CitizenId'].isin(tainted_ids)]
def remove_deprecated_device_data(self, ats: pd.DataFrame) -> pd.DataFrame:
    """Drop individual rows that refer to the deprecated '899,999' device."""
    is_deprecated = ats['DevHMINumber'] == '899,999'
    return ats[~is_deprecated]
def remove_rows_with_old_dates(self, ats: pd.DataFrame, col: str) -> pd.DataFrame:
    """Drop rows whose date in `col` is before 1900-01-01 or in the future.

    The original assigned the parsed dates back into the caller's frame,
    mutating it as a side effect; we now work on a copy so the input is
    left untouched.

    :param ats: data containing a date-like column `col`
    :param col: name of the column to parse and filter on
    :return: rows of `ats` with `col` parsed to datetime and inside range
    """
    ats = ats.copy()
    ats[col] = pd.to_datetime(ats[col])
    mask = (ats[col] >= '1900-01-01') & (ats[col] <= pd.Timestamp('today'))
    return ats.loc[mask]
def drop_invalid_devices(self, ats: pd.DataFrame, iso_classes: pd.DataFrame) -> pd.DataFrame:
    """Keep only rows whose ISO class appears in the reference table."""
    valid_classes = iso_classes.DevISOClass
    return ats[ats['DevISOClass'].isin(valid_classes)]
def remove_screenings_without_exercises(self, df: pd.DataFrame) -> pd.DataFrame:
    """Drop screenings whose exercise content is the literal string 'nan'."""
    has_content = df['ExerciseContent'] != 'nan'
    df = df[has_content]
    return df
......
......@@ -4,45 +4,6 @@ from pandas._libs.tslibs import Timestamp
from typing import Tuple
from utility import data_dto
def make_citizen_training(df):
    """Add per-citizen aggregate training columns.

    All aggregates are computed per (CitizenId, HasCompletedSession) group
    via the get_col_* helpers; `df` is modified in place and returned.
    """
    # (target column, source column, aggregate) — kept in the original
    # assignment order so the resulting column order is unchanged.
    spec = [
        ('NumberWeeksSum', 'NumberWeeks', get_col_cumsum),
        ('NumberTrainingSum', 'NumberTraining', get_col_cumsum),
        ('NeedsBaseline', 'Needs', get_col_first),
        ('MeanEvaluationMean', 'MeanEvaluation', get_col_mean),
        ('StdEvaluationMean', 'StdEvaluation', get_col_mean),
        ('NumberTrainingWeekMean', 'NumberTrainingWeek', get_col_mean),
        ('MeanTimeBetweenTrainingMean', 'MeanTimeBetweenTraining', get_col_mean),
        ('NumberCancelsSum', 'NumberCancels', get_col_cumsum),
        ('MeanTimeBetweenCancelsMean', 'MeanTimeBetweenCancels', get_col_mean),
        ('MeanNumberCancelsWeekMean', 'MeanNumberCancelsWeek', get_col_mean),
        ('NeedsMean', 'Needs', get_col_mean),
        ('PhysicsMean', 'Physics', get_col_mean),
        ('NumberExercisesMean', 'NumberExercises', get_col_mean),
    ]
    for target, source, aggregate in spec:
        df[target] = aggregate(df, source)
    return df
def make_citizen_ats(df):
    """Add per-citizen aggregates for the assistive-technology view.

    `df` is modified in place and returned.
    """
    spec = [
        ('NumberWeeksSum', 'NumberWeeks', get_col_cumsum),
        ('NumberTrainingSum', 'NumberTraining', get_col_cumsum),
        ('NumberAtsMean', 'NumberAts', get_col_mean),
    ]
    for target, source, aggregate in spec:
        df[target] = aggregate(df, source)
    return df
def get_col_cumsum(df, col):
    """Running total of `col` within each (CitizenId, HasCompletedSession)
    group, rounded to 2 decimals."""
    grouped = df.groupby(['CitizenId', 'HasCompletedSession'])[col]
    return grouped.transform(pd.Series.cumsum).round(2)
def get_col_mean(df, col):
    """Mean of `col` within each (CitizenId, HasCompletedSession) group,
    rounded to 2 decimals and broadcast back to every row."""
    grouped = df.groupby(['CitizenId', 'HasCompletedSession'])[col]
    return grouped.transform(pd.Series.mean).round(2)
def get_col_max(df, col):
    """Maximum of `col` within each (CitizenId, HasCompletedSession) group,
    broadcast back to every row (not rounded)."""
    groups = df.groupby(['CitizenId', 'HasCompletedSession'])
    return groups[col].transform(pd.Series.max)
def get_col_first(df, col):
    """First value of `col` within each (CitizenId, HasCompletedSession)
    group, broadcast back to every row."""
    groups = df.groupby(['CitizenId', 'HasCompletedSession'])
    return groups[col].transform('first')
def get_number_of_falls(df):
    """Map each CitizenId to its number of distinct fall dates."""
    unique_falls = df.drop_duplicates(["CitizenId", "Date"])
    counts = unique_falls.groupby(['CitizenId'])['Date'].count()
    return dict(counts)
def get_number_falls(df: pd.DataFrame, start_date: Timestamp, end_date: Timestamp):
if not df.empty:
df = df[(df['Date'] > start_date) & (df['Date'] < end_date)]
......
......@@ -8,43 +8,24 @@ from typing import Tuple
from utility import data_dto
from pandas.tseries.offsets import DateOffset
from abc import ABC, abstractmethod
def annotate_falls(row, digi_db, cura_db, risk_period):
    """Return 1 if the citizen in `row` has a fall registered in either
    database within `risk_period` months after the row's EndDate, else 0.

    :param row: record with 'CitizenId' and 'EndDate'
    :param digi_db: DigiRehab falls with 'CitizenId' and 'EndDate' columns
    :param cura_db: Cura falls with 'CitizenId' and 'Date' columns
    :param risk_period: length of the risk window in months
    """
    citizen_id = row['CitizenId']
    current_date = pd.Timestamp(row['EndDate'])
    end_date = current_date + DateOffset(months=risk_period)
    # Parse dates into local series instead of writing them back into the
    # caller's frames: the original mutated digi_db/cura_db on every call
    # (this runs once per row inside an apply), a hidden side effect.
    digi_dates = pd.to_datetime(digi_db['EndDate'])
    cura_dates = pd.to_datetime(cura_db['Date'])
    digi_hit = ((digi_dates >= current_date)
                & (digi_db['CitizenId'] == citizen_id)
                & (digi_dates <= end_date)).any()
    cura_hit = ((cura_dates >= current_date)
                & (cura_db['CitizenId'] == citizen_id)
                & (cura_dates <= end_date)).any()
    return 1 if (digi_hit or cura_hit) else 0
def make_risk_target(df: pd.DataFrame,
                     cura_falls: pd.DataFrame,
                     settings: dict):
    """Add a binary 'Risk' column: 1 if the citizen has a registered fall
    (Cura or DigiRehab) within the configured risk period after the
    screening's EndDate, else 0. `df` is modified in place and returned."""
    # Cura falls: one row per registered fall date.
    cura_falls = cura_falls[['CitizenId', 'Date']]
    # DigiRehab falls: screenings whose stated reason mentions a fall.
    digi_falls = df[['CitizenId', 'NeedsReason', 'PhysicsReason', 'EndDate']].fillna('Ingen')
    fall_mask = (digi_falls['NeedsReason'].str.contains("Fald/uheld")
                 | digi_falls['PhysicsReason'].str.contains("Fald/uheld"))
    digi_falls = digi_falls[fall_mask]
    # Make target by annotating falls in the risk period.
    risk_period = settings['risk_period_months']
    df['Risk'] = df[['CitizenId', 'EndDate']].apply(
        lambda row: annotate_falls(row, digi_falls, cura_falls, risk_period),
        axis=1)
    return df
def make_compliance_feature(df: pd.DataFrame, settings: dict) -> pd.DataFrame:
def make_compliance_target(df: pd.DataFrame, settings: dict) -> pd.DataFrame:
df = accumulate_screenings(df, settings)
# Set first screening as baseline
......@@ -71,7 +52,7 @@ def make_compliance_feature(df: pd.DataFrame, settings: dict) -> pd.DataFrame:
return df
def make_complete_feature(df: pd.DataFrame, settings: dict) -> pd.DataFrame:
def make_complete_target(df: pd.DataFrame, settings: dict) -> pd.DataFrame:
df = accumulate_screenings(df, settings)
# Set first screening as baseline
......@@ -90,7 +71,7 @@ def make_complete_feature(df: pd.DataFrame, settings: dict) -> pd.DataFrame:
return df
def make_fall_feature(df, settings):
def make_fall_target(df, settings):
df = accumulate_screenings(df, settings)
# Set first screening as baseline
......@@ -109,6 +90,26 @@ def make_fall_feature(df, settings):
return df
def annotate_falls(row, digi_db, cura_db, risk_period):
    """Return 1 when the citizen in `row` has a fall registered in either
    database within `risk_period` months after the row's EndDate, else 0."""
    cid = row['CitizenId']
    window_start = pd.Timestamp(row['EndDate'])
    window_end = window_start + DateOffset(months=risk_period)
    # Date columns are parsed in place (written back into the frames),
    # matching the original behavior.
    digi_db['EndDate'] = pd.to_datetime(digi_db['EndDate'])
    cura_db['Date'] = pd.to_datetime(cura_db['Date'])
    digi_mask = ((digi_db['CitizenId'] == cid)
                 & (digi_db['EndDate'] >= window_start)
                 & (digi_db['EndDate'] <= window_end))
    cura_mask = ((cura_db['CitizenId'] == cid)
                 & (cura_db['Date'] >= window_start)
                 & (cura_db['Date'] <= window_end))
    if digi_mask.any() or cura_mask.any():
        return 1
    return 0
def accumulate_screenings(df: pd.DataFrame, settings: dict) -> pd.DataFrame:
for group_name, _ in df.groupby(['CitizenId', 'NumberSplit']):
number_session = 0
......
import pandas as pd
import datetime
from tools import cleaner
from tools.cleaner import Cleaner2020
def test_filter_ats_on_ids():
    """filter_ats_on_ids keeps exactly the rows whose CitizenId is listed."""
    frame = pd.DataFrame(list(range(10)), columns=['CitizenId'])
    wanted = list(range(5))
    filtered_list = Cleaner2020().filter_ats_on_ids(frame, wanted)
    assert not filtered_list.empty
    assert len(filtered_list) == 5
......@@ -15,7 +15,7 @@ def test_sort_dataframe():
df = pd.DataFrame({
'CitizenId': [42,41],
'RatingDate': [yesterday, today]})
res = cleaner.sort_dataframe(df, ['CitizenId', 'RatingDate'])
res = Cleaner2020().sort_dataframe(df, ['CitizenId', 'RatingDate'])
assert not res.empty
assert res.iloc[0]['CitizenId'] == 41
assert res.iloc[0]['RatingDate'] == today
......@@ -27,7 +27,7 @@ def test_merge_train_and_patient_data():
df2 = pd.DataFrame({
'PatientId': [3, 2, 1],
'RatingScore': [59, 60, 61]})
res = cleaner.merge_train_and_patient_data(df1, df2, 'PatientId')
res = Cleaner2020().merge_train_and_patient_data(df1, df2, 'PatientId')
assert not res.empty
pd.testing.assert_series_equal(res['PatientId'], df1['PatientId'])
......@@ -35,14 +35,14 @@ def test_remove_citizens_without_valid_id():
df = pd.DataFrame({
'CitizenId': ['0', '0', '37'],
'PatientId': [1, 2, 3]})
res = cleaner.remove_citizens_without_valid_id(df)
res = Cleaner2020().remove_citizens_without_valid_id(df)
assert not res.empty
assert len(res) == 1
def test_remove_citizens_not_in_patient_data():
    """Only citizens that also occur in the patient data survive the filter."""
    train = pd.DataFrame({'PatientId': [1, 2, 3]})
    patients = pd.DataFrame({'PatientId': [1]})
    res = Cleaner2020().remove_citizens_not_in_patient_data(train, patients,
                                                            'PatientId')
    assert not res.empty
    assert len(res) == 1
......@@ -51,18 +51,18 @@ def test_remove_tainted_histories():
df = pd.DataFrame({
'DevHMINumber': ['899,997', '899,989', '899,999'],
'CitizenId': [1, 2, 3]})
res = cleaner.remove_tainted_histories(df)
res = Cleaner2020().remove_tainted_histories(df)
assert not res.empty
assert res.shape == (2,2)
def test_remove_deprecated_device_data():
    """Rows carrying the deprecated '899,999' device number are dropped."""
    devices = pd.DataFrame({'DevHMINumber': ['899,997', '899,989', '899,999']})
    res = Cleaner2020().remove_deprecated_device_data(devices)
    assert not res.empty
    assert len(res) == 2
def test_remove_rows_with_old_dates():
    """Dates before 1900-01-01 are filtered out; the boundary is kept."""
    lends = pd.DataFrame({'LendDate': ['1899-02-02', '1900-01-01', '2020-05-27']})
    res = Cleaner2020().remove_rows_with_old_dates(lends, 'LendDate')
    assert not res.empty
    assert len(res) == 2
\ No newline at end of file
......@@ -2,7 +2,7 @@ import pytest
import sklearn.utils as skut
import pandas as pd
import paths as pt
from src.tools import preprocessor, feature_maker, classifiers, file_reader
from src.tools import preprocessor, feature_maker, target_maker, classifiers, file_reader
from src.tools.classifiers import RfClassifier, ClassifierResult
@pytest.fixture(scope="module")
......@@ -12,7 +12,7 @@ def get_data():
def test_predict_complete(get_data):
settings = dict({'threshold_weeks': 8, 'threshold_training': 10})
df = feature_maker.make_complete_feature(get_data, settings)
df = target_maker.make_complete_target(get_data, settings)
df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats', resolution=10)
ats_cols = df.filter(regex='Ats', axis=1)
......@@ -31,7 +31,7 @@ def test_predict_complete(get_data):
def test_predict_compliance(get_data):
settings = dict({'threshold_weeks': 8, 'threshold_training': 10})
df = feature_maker.make_compliance_feature(get_data, settings)
df = target_maker.make_compliance_target(get_data, settings)
df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats', resolution=10)
ats_cols = df.filter(regex='Ats', axis=1)
......@@ -50,7 +50,7 @@ def test_predict_compliance(get_data):
def test_predict_fall(get_data):
settings = dict({'threshold_weeks': 8, 'threshold_training': 10})
df = feature_maker.make_fall_feature(get_data, settings)
df = target_maker.make_fall_target(get_data, settings)
df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats', resolution=10)
ats_cols = df.filter(regex='Ats', axis=1)
......
from tools import feature_maker
from tools import target_maker
import pandas as pd
import numpy as np
def test_make_complete_feature():
    """make_complete_target returns a non-empty frame with a 'Complete' column."""
    df = get_data(n_rows=100, n_citizens=10)
    settings = dict({'threshold_weeks': 8, 'threshold_training': 10})
    res = target_maker.make_complete_target(df, settings)
    assert res.shape[0] > 0
    # A Series object is never None, so the old `res['Complete'] is not None`
    # assertion could never fail; check for the target column explicitly.
    assert 'Complete' in res.columns
def test_make_fall_feature():
    """make_fall_target returns a non-empty frame with a 'Fall' column."""
    df = get_data(n_rows=100, n_citizens=10)
    settings = dict({'threshold_weeks': 8, 'threshold_training': 10})
    res = target_maker.make_fall_target(df, settings)
    assert res.shape[0] > 0
    # A Series object is never None, so the old `res['Fall'] is not None`
    # assertion could never fail; check for the target column explicitly.
    assert 'Fall' in res.columns
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment