Commit b12d63d8 authored by Christian Marius Lillelund's avatar Christian Marius Lillelund
Browse files

Moved some code to a notebook; renamed code in tools.

parent f2b8c184
Pipeline #67138 passed with stage
in 3 minutes and 50 seconds
This diff is collapsed.
......@@ -37,13 +37,13 @@ class BaseCleaner(ABC):
def clean_fall_data(self, fd):
"""Clean the fall dataset ``fd`` and return the cleaned frame.

NOTE(review): only the signature and docstring are visible in this
diff hunk -- the concrete cleaning steps lie outside this view.
"""
def remove_citizens_not_in_patient_data(self, train_data: pd.DataFrame,
                                        patient_data: pd.DataFrame,
                                        id: str) -> pd.DataFrame:
    """Keep only training rows whose id also occurs in the patient data.

    :param train_data: training observations.
    :param patient_data: reference patient records.
    :param id: name of the id column present in both frames.
        (Parameter name shadows the ``id`` builtin but is kept for
        backward compatibility with existing callers.)
    :return: the filtered training frame.
    """
    data = train_data[train_data[id].isin(patient_data[id].unique())]
    return data
def remove_citizens_without_valid_id(self, df: pd.DataFrame) -> pd.DataFrame:
    """Drop rows whose CitizenId is a known placeholder or missing.

    Placeholders removed: the all-zero CPR string, '0', and the Excel
    artifact '#VALUE!'; NaN ids are dropped last.
    NOTE(review): a diff hunk boundary hides one original line here --
    there may be an additional placeholder filter; confirm against the
    full file.

    :param df: frame with a 'CitizenId' column.
    :return: frame containing only rows with a plausible CitizenId.
    """
    df = df[df['CitizenId'] != "0000000000"]
    df = df[df['CitizenId'] != '0']
    df = df[df['CitizenId'] != "#VALUE!"]
    df = df.dropna(subset=['CitizenId'])
    return df
def merge_train_and_patient_data(self, train_data: pd.DataFrame,
                                 patient_data: pd.DataFrame,
                                 key: str) -> pd.DataFrame:
    """Inner-join training and patient data on ``key``.

    :param train_data: training observations.
    :param patient_data: patient records.
    :param key: shared join column name.
    :return: the merged frame (pandas default inner join).
    """
    return pd.merge(train_data, patient_data, on=key)
def sort_dataframe(self, data: pd.DataFrame, by: str) -> pd.DataFrame:
    """Return ``data`` sorted ascending by column(s) ``by``.

    :param data: frame to sort (not modified in place).
    :param by: column name, or list of column names, to sort by.
    :return: a new, sorted frame.
    """
    return data.sort_values(by)
def filter_ats_on_ids(self, ats: pd.DataFrame, ids: List[str]) -> pd.DataFrame:
    """Keep only assistive-technology rows for the given citizen ids.

    :param ats: ATS records with a 'CitizenId' column.
    :param ids: citizen ids to retain.
    :return: the filtered frame.
    """
    return ats[ats['CitizenId'].isin(ids)]
def remove_tainted_histories(self, ats: pd.DataFrame) -> pd.DataFrame:
    """Drop ALL rows of any citizen who ever has the deprecated device
    number '899,999' -- their whole device history is considered tainted.

    :param ats: ATS records with 'DevHMINumber' and 'CitizenId' columns.
    :return: frame without the tainted citizens' rows.
    """
    # Typo fixed: 'tained_ids' -> 'tainted_ids' (local name only).
    tainted_ids = ats[ats['DevHMINumber'] == '899,999']['CitizenId'].unique()
    ats = ats[~ats['CitizenId'].isin(tainted_ids)]
    return ats
def remove_deprecated_device_data(self, ats: pd.DataFrame) -> pd.DataFrame:
    """Drop individual rows carrying the deprecated device number '899,999'.

    Unlike ``remove_tainted_histories`` this keeps the citizen's other rows.

    :param ats: ATS records with a 'DevHMINumber' column.
    :return: the filtered frame.
    """
    return ats[ats['DevHMINumber'] != '899,999']
def remove_rows_with_old_dates(self, ats: pd.DataFrame, col: str) -> pd.DataFrame:
    """Keep only rows whose date in ``col`` is between 1900-01-01 and today.

    Bounds are inclusive. Fix over the original: work on a copy so the
    caller's frame is not mutated by the to_datetime conversion (this also
    avoids pandas SettingWithCopy warnings).

    :param ats: frame containing a date-like column ``col``.
    :param col: name of the date column to validate.
    :return: the filtered frame with ``col`` parsed as datetime.
    """
    ats = ats.copy()
    ats[col] = pd.to_datetime(ats[col])
    mask = (ats[col] >= '1900-01-01') & (ats[col] <= pd.Timestamp('today'))
    return ats.loc[mask]
def drop_invalid_devices(self, ats: pd.DataFrame, iso_classes: pd.DataFrame) -> pd.DataFrame:
    """Keep only rows whose device ISO class is in the known ISO class list.

    :param ats: ATS records with a 'DevISOClass' column.
    :param iso_classes: reference frame with a 'DevISOClass' column of
        valid ISO classes.
    :return: the filtered frame.
    """
    return ats[ats['DevISOClass'].isin(iso_classes.DevISOClass)]
def remove_screenings_without_exercises(self, df: pd.DataFrame) -> pd.DataFrame:
    """Drop screenings whose exercise content is the literal string 'nan'.

    NOTE: this matches the *string* 'nan' (the upstream export writes
    missing content that way), not actual NaN values.

    :param df: screenings with an 'ExerciseContent' column.
    :return: frame of screenings that contain exercises.
    """
    df = df[df['ExerciseContent'] != 'nan']
    return df
......
......@@ -4,45 +4,6 @@ from pandas._libs.tslibs import Timestamp
from typing import Tuple
from utility import data_dto
def make_citizen_training(df):
    """Add per-citizen aggregate training features to ``df``.

    Each new column is the source column aggregated (cumulative sum, mean,
    or first value) within (CitizenId, HasCompletedSession) groups and
    broadcast back onto every row via the ``get_col_*`` helpers.

    :param df: screenings frame; modified in place and returned.
    :return: ``df`` with the derived columns appended.
    """
    df['NumberWeeksSum'] = get_col_cumsum(df, 'NumberWeeks')
    df['NumberTrainingSum'] = get_col_cumsum(df, 'NumberTraining')
    df['NeedsBaseline'] = get_col_first(df, 'Needs')
    df['MeanEvaluationMean'] = get_col_mean(df, 'MeanEvaluation')
    df['StdEvaluationMean'] = get_col_mean(df, 'StdEvaluation')
    df['NumberTrainingWeekMean'] = get_col_mean(df, 'NumberTrainingWeek')
    df['MeanTimeBetweenTrainingMean'] = get_col_mean(df, 'MeanTimeBetweenTraining')
    df['NumberCancelsSum'] = get_col_cumsum(df, 'NumberCancels')
    df['MeanTimeBetweenCancelsMean'] = get_col_mean(df, 'MeanTimeBetweenCancels')
    df['MeanNumberCancelsWeekMean'] = get_col_mean(df, 'MeanNumberCancelsWeek')
    df['NeedsMean'] = get_col_mean(df, 'Needs')
    df['PhysicsMean'] = get_col_mean(df, 'Physics')
    df['NumberExercisesMean'] = get_col_mean(df, 'NumberExercises')
    return df
def make_citizen_ats(df):
    """Add per-citizen aggregate assistive-technology features to ``df``.

    Aggregates are computed within (CitizenId, HasCompletedSession) groups
    by the ``get_col_*`` helpers and broadcast back onto every row.

    :param df: screenings frame; modified in place and returned.
    :return: ``df`` with the derived columns appended.
    """
    df['NumberWeeksSum'] = get_col_cumsum(df, 'NumberWeeks')
    df['NumberTrainingSum'] = get_col_cumsum(df, 'NumberTraining')
    df['NumberAtsMean'] = get_col_mean(df, 'NumberAts')
    return df
def get_col_cumsum(df, col):
    """Cumulative sum of ``col`` within (CitizenId, HasCompletedSession)
    groups, rounded to 2 decimals and aligned row-by-row with ``df``.
    """
    grouped = df.groupby(['CitizenId', 'HasCompletedSession'])[col]
    return np.around(grouped.transform(pd.Series.cumsum), decimals=2)
def get_col_mean(df, col):
    """Group mean of ``col`` within (CitizenId, HasCompletedSession)
    groups, rounded to 2 decimals and broadcast onto every row of ``df``.
    """
    grouped = df.groupby(['CitizenId', 'HasCompletedSession'])[col]
    return np.around(grouped.transform(pd.Series.mean), decimals=2)
def get_col_max(df, col):
    """Group maximum of ``col`` within (CitizenId, HasCompletedSession)
    groups, broadcast onto every row of ``df`` (no rounding).
    """
    return df.groupby(['CitizenId', 'HasCompletedSession'])[col].transform(pd.Series.max)
def get_col_first(df, col):
    """First value of ``col`` within each (CitizenId, HasCompletedSession)
    group, broadcast onto every row of ``df`` -- used for baselines.
    """
    return df.groupby(['CitizenId', 'HasCompletedSession'])[col].transform('first')
def get_number_of_falls(df):
    """Count distinct fall dates per citizen.

    Duplicate (CitizenId, Date) registrations are collapsed first so a
    fall recorded twice on the same day counts once.

    :param df: fall records with 'CitizenId' and 'Date' columns.
    :return: dict mapping CitizenId -> number of distinct fall dates.
    """
    df = df.drop_duplicates(["CitizenId", "Date"])
    fall_dict = dict(df.groupby(['CitizenId'])['Date'].count())
    return fall_dict
def get_number_falls(df: pd.DataFrame, start_date: Timestamp, end_date: Timestamp):
if not df.empty:
df = df[(df['Date'] > start_date) & (df['Date'] < end_date)]
......
......@@ -8,43 +8,24 @@ from typing import Tuple
from utility import data_dto
from pandas.tseries.offsets import DateOffset
from abc import ABC, abstractmethod
def annotate_falls(row, digi_db, cura_db, risk_period):
    """Return 1 if the citizen in ``row`` has a registered fall within
    ``risk_period`` months after the row's EndDate, else 0.

    Falls are looked up in both the DigiRehab (``digi_db``, by 'EndDate')
    and Cura (``cura_db``, by 'Date') registrations; the risk window is
    inclusive at both ends.

    NOTE: converts the date columns of ``digi_db``/``cura_db`` to datetime
    in place on every call.
    """
    citizen_id = row['CitizenId']
    current_date = pd.Timestamp(row['EndDate'])
    end_date = current_date + DateOffset(months=risk_period)
    digi_db['EndDate'] = pd.to_datetime(digi_db['EndDate'])
    cura_db['Date'] = pd.to_datetime(cura_db['Date'])
    timespan_digi_falls = digi_db.loc[(digi_db['EndDate'] >= current_date)
                                      & (digi_db['CitizenId'] == citizen_id)
                                      & (digi_db['EndDate'] <= end_date)]
    timespan_cura_falls = cura_db.loc[(cura_db['Date'] >= current_date)
                                      & (cura_db['CitizenId'] == citizen_id)
                                      & (cura_db['Date'] <= end_date)]
    if len(timespan_digi_falls) > 0 or len(timespan_cura_falls) > 0:
        return 1
    return 0
def make_risk_target(df: pd.DataFrame,
                     cura_falls: pd.DataFrame,
                     settings: dict):
    """Add a binary 'Risk' target: 1 if the citizen falls within the
    configured risk period after each screening's EndDate.

    :param df: screenings; gains a 'Risk' column (modified in place).
    :param cura_falls: Cura fall registrations ('CitizenId', 'Date').
    :param settings: must contain 'risk_period_months'.
    :return: ``df`` with the 'Risk' column added.
    """
    # Get Cura and DigiRehab falls, respectively
    cura_falls = cura_falls[['CitizenId', 'Date']]
    digi_falls = df[['CitizenId', 'NeedsReason', 'PhysicsReason', 'EndDate']].fillna('Ingen')
    digi_falls = digi_falls[digi_falls['NeedsReason'].str.contains("Fald/uheld")
                            | digi_falls['PhysicsReason'].str.contains("Fald/uheld")]
    # Make target by annotating falls in the risk period
    risk_period = settings['risk_period_months']
    df['Risk'] = df[['CitizenId', 'EndDate']].apply(lambda x:
        annotate_falls(x, digi_falls, cura_falls, risk_period), axis=1)
    return df
def make_compliance_feature(df: pd.DataFrame, settings: dict) -> pd.DataFrame:
def make_compliance_target(df: pd.DataFrame, settings: dict) -> pd.DataFrame:
df = accumulate_screenings(df, settings)
# Set first screening as baseline
......@@ -71,7 +52,7 @@ def make_compliance_feature(df: pd.DataFrame, settings: dict) -> pd.DataFrame:
return df
def make_complete_feature(df: pd.DataFrame, settings: dict) -> pd.DataFrame:
def make_complete_target(df: pd.DataFrame, settings: dict) -> pd.DataFrame:
df = accumulate_screenings(df, settings)
# Set first screening as baseline
......@@ -90,7 +71,7 @@ def make_complete_feature(df: pd.DataFrame, settings: dict) -> pd.DataFrame:
return df
def make_fall_feature(df, settings):
def make_fall_target(df, settings):
df = accumulate_screenings(df, settings)
# Set first screening as baseline
......@@ -109,6 +90,26 @@ def make_fall_feature(df, settings):
return df
def annotate_falls(row, digi_db, cura_db, risk_period):
    """Flag whether this citizen fell within ``risk_period`` months after
    the screening end date.

    Checks both fall registries: DigiRehab (``digi_db``, date column
    'EndDate') and Cura (``cura_db``, date column 'Date'). The window
    [EndDate, EndDate + risk_period months] is inclusive at both ends.

    NOTE: the registry frames' date columns are converted to datetime in
    place on every call.

    :return: 1 if any fall lies in the window, otherwise 0.
    """
    citizen_id = row['CitizenId']
    current_date = pd.Timestamp(row['EndDate'])
    end_date = current_date + DateOffset(months=risk_period)
    digi_db['EndDate'] = pd.to_datetime(digi_db['EndDate'])
    cura_db['Date'] = pd.to_datetime(cura_db['Date'])
    timespan_digi_falls = digi_db.loc[(digi_db['EndDate'] >= current_date)
                                      & (digi_db['CitizenId'] == citizen_id)
                                      & (digi_db['EndDate'] <= end_date)]
    timespan_cura_falls = cura_db.loc[(cura_db['Date'] >= current_date)
                                      & (cura_db['CitizenId'] == citizen_id)
                                      & (cura_db['Date'] <= end_date)]
    if len(timespan_digi_falls) > 0 or len(timespan_cura_falls) > 0:
        return 1
    return 0
def accumulate_screenings(df: pd.DataFrame, settings: dict) -> pd.DataFrame:
for group_name, _ in df.groupby(['CitizenId', 'NumberSplit']):
number_session = 0
......
import pandas as pd
import datetime
from tools import cleaner
from tools.cleaner import Cleaner2020
def test_filter_ats_on_ids():
    """Filtering on a subset of citizen ids keeps exactly those rows."""
    df = pd.DataFrame(list(range(10)), columns=['CitizenId'])
    ids = list(range(5))
    filtered_list = Cleaner2020().filter_ats_on_ids(df, ids)
    assert not filtered_list.empty
    assert len(filtered_list) == 5
......@@ -15,7 +15,7 @@ def test_sort_dataframe():
df = pd.DataFrame({
'CitizenId': [42,41],
'RatingDate': [yesterday, today]})
res = cleaner.sort_dataframe(df, ['CitizenId', 'RatingDate'])
res = Cleaner2020().sort_dataframe(df, ['CitizenId', 'RatingDate'])
assert not res.empty
assert res.iloc[0]['CitizenId'] == 41
assert res.iloc[0]['RatingDate'] == today
......@@ -27,7 +27,7 @@ def test_merge_train_and_patient_data():
df2 = pd.DataFrame({
'PatientId': [3, 2, 1],
'RatingScore': [59, 60, 61]})
res = cleaner.merge_train_and_patient_data(df1, df2, 'PatientId')
res = Cleaner2020().merge_train_and_patient_data(df1, df2, 'PatientId')
assert not res.empty
pd.testing.assert_series_equal(res['PatientId'], df1['PatientId'])
......@@ -35,14 +35,14 @@ def test_remove_citizens_without_valid_id():
df = pd.DataFrame({
'CitizenId': ['0', '0', '37'],
'PatientId': [1, 2, 3]})
res = cleaner.remove_citizens_without_valid_id(df)
res = Cleaner2020().remove_citizens_without_valid_id(df)
assert not res.empty
assert len(res) == 1
def test_remove_citizens_not_in_patient_data():
    """Citizens absent from the patient data are dropped from training data."""
    df_train = pd.DataFrame({'PatientId': [1, 2, 3]})
    df_patient = pd.DataFrame({'PatientId': [1]})
    res = Cleaner2020().remove_citizens_not_in_patient_data(
        df_train, df_patient, 'PatientId')
    assert not res.empty
    assert len(res) == 1
......@@ -51,18 +51,18 @@ def test_remove_tainted_histories():
df = pd.DataFrame({
'DevHMINumber': ['899,997', '899,989', '899,999'],
'CitizenId': [1, 2, 3]})
res = cleaner.remove_tainted_histories(df)
res = Cleaner2020().remove_tainted_histories(df)
assert not res.empty
assert res.shape == (2,2)
def test_remove_deprecated_device_data():
    """Rows with the deprecated device number '899,999' are removed."""
    df = pd.DataFrame({'DevHMINumber': ['899,997', '899,989', '899,999']})
    res = Cleaner2020().remove_deprecated_device_data(df)
    assert not res.empty
    assert len(res) == 2
def test_remove_rows_with_old_dates():
    """Dates before 1900-01-01 are dropped; 1900-01-01 itself is kept."""
    df = pd.DataFrame({'LendDate': ['1899-02-02', '1900-01-01', '2020-05-27']})
    res = Cleaner2020().remove_rows_with_old_dates(df, 'LendDate')
    assert not res.empty
    assert len(res) == 2
\ No newline at end of file
......@@ -2,7 +2,7 @@ import pytest
import sklearn.utils as skut
import pandas as pd
import paths as pt
from src.tools import preprocessor, feature_maker, classifiers, file_reader
from src.tools import preprocessor, feature_maker, target_maker, classifiers, file_reader
from src.tools.classifiers import RfClassifier, ClassifierResult
@pytest.fixture(scope="module")
......@@ -12,7 +12,7 @@ def get_data():
def test_predict_complete(get_data):
settings = dict({'threshold_weeks': 8, 'threshold_training': 10})
df = feature_maker.make_complete_feature(get_data, settings)
df = target_maker.make_complete_target(get_data, settings)
df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats', resolution=10)
ats_cols = df.filter(regex='Ats', axis=1)
......@@ -31,7 +31,7 @@ def test_predict_complete(get_data):
def test_predict_compliance(get_data):
settings = dict({'threshold_weeks': 8, 'threshold_training': 10})
df = feature_maker.make_compliance_feature(get_data, settings)
df = target_maker.make_compliance_target(get_data, settings)
df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats', resolution=10)
ats_cols = df.filter(regex='Ats', axis=1)
......@@ -50,7 +50,7 @@ def test_predict_compliance(get_data):
def test_predict_fall(get_data):
settings = dict({'threshold_weeks': 8, 'threshold_training': 10})
df = feature_maker.make_fall_feature(get_data, settings)
df = target_maker.make_fall_target(get_data, settings)
df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats', resolution=10)
ats_cols = df.filter(regex='Ats', axis=1)
......
from tools import feature_maker
from tools import target_maker
import pandas as pd
import numpy as np
def test_make_complete_feature():
    """make_complete_target returns rows and adds a 'Complete' column."""
    df = get_data(n_rows=100, n_citizens=10)
    settings = {'threshold_weeks': 8, 'threshold_training': 10}
    res = target_maker.make_complete_target(df, settings)
    assert res.shape[0] > 0
    # Stronger than the old `res['Complete'] is not None` (always true):
    assert 'Complete' in res.columns
def test_make_fall_feature():
    """make_fall_target returns rows and adds a 'Fall' column."""
    df = get_data(n_rows=100, n_citizens=10)
    settings = {'threshold_weeks': 8, 'threshold_training': 10}
    res = target_maker.make_fall_target(df, settings)
    assert res.shape[0] > 0
    # Stronger than the old `res['Fall'] is not None` (always true):
    assert 'Fall' in res.columns
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment