Commit 15a487bc authored by Christian Marius Lillelund
Browse files

Added a model for who completes

parent 0b5d9c96
Pipeline #25394 failed with stage
in 2 minutes and 23 seconds
This source diff could not be displayed because it is too large. You can view the blob instead.
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
# Load the interim CSV X_test_scale_ft.csv into df so its contents can be inspected below.
df = pd.read_csv('../data/interim/X_test_scale_ft.csv')
```
%% Cell type:code id: tags:
``` python
# Show the column labels of the loaded frame.
df.columns
```
%% Output
Index(['CitizenId', 'PatientId', 'Sex', 'BirthYear', 'Age', 'StartDate',
'EndDate', 'nWeeks', 'MeanEvaluation', 'StdEvaluation', 'MinEvaluation',
'MaxEvaluation', 'nTraining', 'nTrainingOptimal', 'nTrainingPrWeek',
'nTrainingPrWeekMax', 'nTrainingPrWeekMin', 'nWeeksWithTrainings',
'TimeBetweenTrainingsAvg', 'nWeeksWithTrainingsIn12Weeks', 'SP-START',
'SP-END', 'SP-ALL', 'nWeeksWithoutTrainings', 'nCancellations',
'TimeBetweenCancelsAvg', 'nCancellationsPrWeekAVG',
'nCancellationsPrWeekMax', 'nCancellationsPrWeekMin',
'NumberATsRunning', 'Cluster', 'HasRollator', 'HasRaisedToiletSeat',
'HasShowerStool', 'HasRaisedToiletSeatAndShowerStool', 'NeedsStart',
'NeedsEnd', 'NeedsDifference', 'PhysicsStart', 'PhysicsEnd',
'PhysicsDifference', 'Physics', 'RehabIndicator', 'DevicesUnique_0',
'DevicesUnique_043303', 'DevicesCount_0', 'DevicesCount_043303'],
dtype='object')
%% Cell type:code id: tags:
``` python
# Preview the first rows of the frame.
df.head()
```
%% Output
CitizenId PatientId Sex BirthYear Age StartDate EndDate nWeeks \
0 182361925722 42127 0 1936 82 736759 736877 16.9
1 166115733051 42135 1 1935 83 736815 736957 20.3
2 179311002207 41318 1 1948 69 736513 736639 18.0
3 168243427623 39062 1 1966 49 735897 735985 13.0
4 659524639842 41869 0 1936 82 736880 737062 26.0
MeanEvaluation StdEvaluation ... NeedsDifference PhysicsStart \
0 3.0 0.7 ... 17.0 27.0
1 5.7 0.8 ... 8.0 47.0
2 3.2 0.9 ... 5.0 50.0
3 3.4 0.7 ... -12.0 18.0
4 4.4 0.5 ... -1.0 87.0
PhysicsEnd PhysicsDifference Physics RehabIndicator DevicesUnique_0 \
0 13.0 -14.0 0 1.555556 0
1 44.0 -3.0 0 0.021277 1
2 42.0 -8.0 0 1.220000 1
3 50.0 32.0 1 2.444444 1
4 91.0 4.0 1 0.011494 1
DevicesUnique_043303 DevicesCount_0 DevicesCount_043303
0 1 0 1
1 0 1 0
2 0 1 0
3 0 1 0
4 0 1 0
[5 rows x 47 columns]
%% Cell type:code id: tags:
``` python
# Count how many rows carry each distinct Cluster value.
df.Cluster.value_counts()
```
%% Output
0 9
Name: Cluster, dtype: int64
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
# Restore pandas display defaults, then allow up to 100 columns to be shown.
pd.reset_option('^display.', silent=True)
pd.set_option('display.max_columns', 100)

# Load the time series and discard the free-text "...Reason" columns.
df = pd.read_csv('../data/interim/timeseries.csv')
df = df.drop(['NeedsStartReason', 'NeedsReason', 'PhysicsStartReason', 'PhysicsReason'], axis=1)

# Per-citizen running totals (in row order) and the first NeedsStart value per citizen.
df['NumberWeeksSum'] = df.groupby('CitizenId')['NumberWeeks'].cumsum()
df['NumberTrainingSum'] = df.groupby('CitizenId')['NumberTraining'].cumsum()
df['NeedsStartBaseline'] = df.groupby('CitizenId')['NeedsStart'].transform('first')

# A zero baseline would make the relative improvement below a division by zero.
df = df[df['NeedsStartBaseline'] != 0]

# Completed: the first row per citizen where the running totals reach
# at least 8 weeks and at least 7 trainings.
reached_targets = (df['NumberWeeksSum'] >= 8) & (df['NumberTrainingSum'] >= 7)
df_completed = df.loc[reached_targets] \
    .drop_duplicates(subset='CitizenId').reset_index(drop=True)

# Not completed: every other citizen, represented by their last row.
is_completer = df['CitizenId'].isin(df_completed['CitizenId'])
df_not_completed = df.loc[~is_completer] \
    .drop_duplicates(subset='CitizenId', keep='last').reset_index(drop=True)

# Improved: completers whose NeedsEnd dropped by at least 10% relative to the baseline.
relative_improvement = (df_completed['NeedsStartBaseline'] - df_completed['NeedsEnd']) \
    / df_completed['NeedsStartBaseline']
df_improved = df_completed.loc[relative_improvement >= 0.1]
has_improved = df_completed['CitizenId'].isin(df_improved['CitizenId'])
df_not_improved = df_completed.loc[~has_improved]

# Columns left out of the profile reports.
unrelated_cols = ['Exercises', 'LastStatus', 'LastStatusDate', 'DevicesCount', 'DevicesUnique']
df_completed_prep = df_completed.drop(unrelated_cols, axis=1)
df_not_completed_prep = df_not_completed.drop(unrelated_cols, axis=1)
df_improved_prep = df_improved.drop(unrelated_cols, axis=1)
df_not_improved_prep = df_not_improved.drop(unrelated_cols, axis=1)

profile_comp = ProfileReport(df_completed_prep)
profile_fail = ProfileReport(df_not_completed_prep)
profile_improved = ProfileReport(df_improved_prep)
profile_not_improved = ProfileReport(df_not_improved_prep)

# Forward slashes (as in the input path above) resolve under ../reports on both
# Windows and POSIX; backslash separators only work on Windows and "\p" is not
# a valid string escape.
profile_comp.to_file("../reports/profile_completed.html")
profile_fail.to_file("../reports/profile_fail.html")
profile_improved.to_file("../reports/profile_improved.html")
profile_not_improved.to_file("../reports/profile_not_improved.html")
```
%% Output
Summarize dataset: 100%|██████████| 59/59 [01:56<00:00, 1.97s/it, Completed]\nGenerate report structure: 100%|██████████| 1/1 [00:09<00:00, 9.17s/it]\nRender HTML: 100%|██████████| 1/1 [00:15<00:00, 15.99s/it]\nExport report to file: 100%|██████████| 1/1 [00:00<00:00, 3.72it/s]\nSummarize dataset: 100%|██████████| 59/59 [01:53<00:00, 1.92s/it, Completed]\nGenerate report structure: 100%|██████████| 1/1 [00:10<00:00, 10.56s/it]\nRender HTML: 100%|██████████| 1/1 [00:13<00:00, 13.27s/it]\nExport report to file: 100%|██████████| 1/1 [00:00<00:00, 4.43it/s]\nSummarize dataset: 100%|██████████| 60/60 [01:52<00:00, 1.88s/it, Completed]\nGenerate report structure: 100%|██████████| 1/1 [00:11<00:00, 11.62s/it]\nRender HTML: 100%|██████████| 1/1 [00:12<00:00, 12.80s/it]\nExport report to file: 100%|██████████| 1/1 [00:00<00:00, 5.16it/s]\nSummarize dataset: 100%|██████████| 60/60 [02:05<00:00, 2.09s/it, Completed]\nGenerate report structure: 100%|██████████| 1/1 [00:09<00:00, 9.75s/it]\nRender HTML: 100%|██████████| 1/1 [00:13<00:00, 13.34s/it]\nExport report to file: 100%|██████████| 1/1 [00:00<00:00, 4.76it/s]\n
This diff is collapsed.