Commit 9841f63c authored by Christian Marius Lillelund's avatar Christian Marius Lillelund

corrected unit tests and a few bugs

parent 7a57387f
Pipeline #25779 passed with stage
in 2 minutes and 11 seconds
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
import datetime as dt
import src.models.feature_maker as fm
pd.reset_option('^display.', silent=True)
df = pd.read_csv('../data/interim/timeseries.csv')
df = df.drop(['NeedsStartReason', 'NeedsReason', 'PhysicsStartReason', 'PhysicsReason'], axis=1)
print(f"Number of screenings: {len(df)}")
print(f"Number of citizens: {df.CitizenId.nunique()}")
df['NumberWeeksSum'] = df.groupby('CitizenId')['NumberWeeks'].transform(pd.Series.cumsum)
df['NumberTrainingSum'] = df.groupby('CitizenId')['NumberTraining'].transform(pd.Series.cumsum)
df['NeedsStartBaseline'] = df.groupby('CitizenId')["NeedsStart"].transform('first')
df = df[df['NeedsStartBaseline'] != 0]
df = fm.assign_number_completed(df)
df_completed = df.loc[(df['NumberWeeksSum'] >= 8) & (df['NumberTrainingSum'] >= 7)] \
.drop_duplicates(subset='CitizenId').reset_index(drop=True)
print(f"Number of citizens that completed: {len(df_completed)}")
df_failed = df.drop(df[df.CitizenId.isin(df_completed.CitizenId)].index) \
.drop_duplicates(subset='CitizenId', keep='last').reset_index(drop=True)
print(f"Number of citizens that did not complete: {len(df_failed)}")
df_improved = df_completed.loc[(df_completed['NeedsStartBaseline'] - df_completed['NeedsEnd']) /
df_completed['NeedsStartBaseline'] >= 0.1]
print(f"Number of citizens completed and improved: {len(df_improved)}")
df_not_improved = df_completed.drop(df_completed[df_completed.CitizenId.isin(df_improved.CitizenId)].index)
print(f"Number of citizens completed and did not improve: {len(df_not_improved)}")
```
%% Output
Number of screenings: 1793
Number of citizens: 420
Number of citizens that completed: 233
Number of citizens that did not complete: 177
Number of citizens completed and improved: 107
Number of citizens completed and did not improve: 126
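%% Cell type:markdown id: tags:
A quick illustration of the improvement rule used above: a citizen counts as improved when the final needs score has dropped by at least 10% relative to the baseline. A minimal sketch on invented values (only the column names `NeedsStartBaseline` and `NeedsEnd` come from the frame above):
%% Cell type:code id: tags:
``` python
import pandas as pd
# Invented baseline and final needs scores for two hypothetical citizens
toy = pd.DataFrame({'NeedsStartBaseline': [56.0, 32.0], 'NeedsEnd': [54.0, 20.0]})
# Relative improvement: (baseline - end) / baseline; >= 0.1 counts as improved
toy['Improved'] = (toy['NeedsStartBaseline'] - toy['NeedsEnd']) / toy['NeedsStartBaseline'] >= 0.1
print(toy)  # first row: 3.6% drop -> False; second row: 37.5% drop -> True
```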
%% Cell type:code id: tags:
``` python
df_completed.head()
```
%% Output
       CitizenId  PatientId  Sex  Age  NumberScreening   StartDate  \
0   657788503758      42442    0   89                4  2018-06-04
1   659524639842      41869    0   82                5  2018-01-09
2  4077528813276      40725    0   83                4  2017-02-01
3  2767655006661      39157    1   89                3  2016-01-29
4  1957778298801      39223    1   89                3  2016-03-15

      EndDate LastStatusDate  NumberWeeks  MeanEvaluation  ...  \
0  2018-07-16     2018-07-16         6.00             3.6  ...
1  2018-02-01     2020-08-14         3.29             4.1  ...
2  2017-12-01     2017-03-15        43.29             4.9  ...
3  2016-03-13     2020-08-14         6.29             3.6  ...
4  2016-04-26     2020-08-14         6.00             5.2  ...

   PhysicsDifference  PhysicsIndicator  RehabIndicator  \
0               -9.0                 0            2.78
1              -17.0                 0            0.13
2               19.0                 1            2.00
3               10.0                 1            1.07
4               -2.0                 0            1.31

                                           Exercises  NumberExercises  \
0  ['289957', '289958', '289959', '289960', '2899...                5
1  ['279431', '279432', '279433', '279434', '2794...                9
2  ['262621', '262622', '262623', '262624', '2626...                9
3           ['239401', '239402', '239405', '239403']                4
4  ['241171', '241172', '241173', '241174', '2411...                9

            LastStatus  NumberWeeksSum  NumberTrainingSum  NeedsStartBaseline  \
0  SignificantProgress           10.71                 16                56.0
1                 None           10.86                 24                32.0
2           Terminated           50.43                 17                57.0
3                 None           11.86                 31                31.0
4                 None           11.71                 15                23.0

   NumberCompleted
0              1.0
1              1.0
2              1.0
3              1.0
4              1.0

[5 rows x 53 columns]
%% Cell type:code id: tags:
``` python
df_completed.Age.mean()
```
%% Output
81.1931330472103
%% Cell type:code id: tags:
``` python
df_failed.Age.mean()
```
%% Output
81.50847457627118
%% Cell type:code id: tags:
``` python
# Show which clusters most citizens belong to
df_completed['Cluster'].value_counts().loc[:3]
```
%% Output
0     47
2     23
24    20
48    11
10    10
11     9
25     9
7      8
17     8
9      7
4      7
37     6
3      6
Name: Cluster, dtype: int64
%% Cell type:code id: tags:
``` python
df_failed['Cluster'].value_counts().loc[:3]
```
%% Output
0     32
2     26
48    14
24    10
17     8
4      7
37     7
7      7
11     7
23     6
1      4
3      4
Name: Cluster, dtype: int64
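%% Cell type:markdown id: tags:
A note on the `.loc[:3]` used in the two cells above: on a `value_counts()` result it slices by label, taking rows until the cluster label 3 first appears, which is why more than three rows are shown. If the intent is the three most common clusters, positional selection does that directly; a minimal sketch:
%% Cell type:code id: tags:
``` python
counts = df_completed['Cluster'].value_counts()
print(counts.head(3))  # the three most frequent clusters (positional)
print(counts.loc[:3])  # label-based slice: every row up to the label 3
```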
%% Cell type:code id: tags:
``` python
df_completed.HasRollator.value_counts()
```
%% Output
True     118
False    115
Name: HasRollator, dtype: int64
%% Cell type:code id: tags:
``` python
df_failed.HasRollator.value_counts()
```
%% Output
True     98
False    79
Name: HasRollator, dtype: int64
%% Cell type:code id: tags:
``` python
df_completed.HasRaisedToiletSeat.value_counts()
```
%% Output
False    118
True     115
Name: HasRaisedToiletSeat, dtype: int64
%% Cell type:code id: tags:
``` python
print(df_completed.HasRollator.value_counts()[1]/len(df_completed))
print(df_completed.HasShowerStool.value_counts()[1]/len(df_completed))
```
%% Output
0.5064377682403434
0.36909871244635195
%% Cell type:code id: tags:
``` python
print(df_failed.HasRollator.value_counts()[1]/len(df_failed))
print(df_failed.HasShowerStool.value_counts()[1]/len(df_failed))
```
%% Output
0.5536723163841808
0.4463276836158192
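%% Cell type:markdown id: tags:
Since these are boolean columns, the share of `True` values can be read off directly with `.mean()`, avoiding the indexing into `value_counts()`. A sketch, assuming the columns contain no missing values:
%% Cell type:code id: tags:
``` python
# Fraction of citizens with a rollator / shower stool, directly from the boolean mean
print(df_completed.HasRollator.mean(), df_completed.HasShowerStool.mean())
print(df_failed.HasRollator.mean(), df_failed.HasShowerStool.mean())
```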
%% Cell type:code id: tags:
``` python
# Approximate exercise counts: Exercises holds the string form of a list, and each
# entry takes roughly ten characters, so len(x)/10 estimates the number of exercises
print(df_completed.Exercises.apply(lambda x: len(x)/10).describe(), '\n')
print(df_failed.Exercises.apply(lambda x: len(x)/10).describe())
```
%% Output
count    233.000000
mean       6.742489
std        1.796248
min        1.000000
25%        5.000000
50%        7.000000
75%        8.000000
max        9.000000
Name: Exercises, dtype: float64 

count    177.000000
mean       6.310734
std        2.139867
min        1.000000
25%        5.000000
50%        7.000000
75%        8.000000
max        9.000000
Name: Exercises, dtype: float64
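%% Cell type:markdown id: tags:
The `len(x)/10` heuristic above only approximates the number of exercises from the length of the stringified list. Parsing the string gives exact counts; a sketch, assuming every `Exercises` cell is a valid Python list literal:
%% Cell type:code id: tags:
``` python
import ast
# Exact exercise counts by parsing the stringified lists
exact_completed = df_completed.Exercises.apply(lambda x: len(ast.literal_eval(x)))
exact_failed = df_failed.Exercises.apply(lambda x: len(ast.literal_eval(x)))
print(exact_completed.describe(), '\n')
print(exact_failed.describe())
```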
%% Cell type:code id: tags:
``` python
import ast
def get_ats_list(devices):
    return pd.Series([str(inner) for item in devices for inner in ast.literal_eval(item)])
ats_completed = get_ats_list(df_completed.ATS)
ats_failed = get_ats_list(df_failed.ATS)
# Print top ATS
print(ats_completed.value_counts().head(10), "\n")
print(ats_failed.value_counts().head(10))
```
%% Output
120606    256
122203    136
043303    131
093307    124
091203    103
091218     98
043306     94
242103     76
222718     67
181210     56
dtype: int64 

120606    193
093307    108
043303     71
122203     70
043306     67
222718     67
091218     65
091203     64
181210     46
123103     41
dtype: int64
%% Cell type:code id: tags:
``` python
# Share of device lendings that fall outside the single most frequent device
print(ats_completed.value_counts()[1:].sum()/len(ats_completed))
print(ats_failed.value_counts()[1:].sum()/len(ats_failed))
```
%% Output
0.8499413833528722
0.8454763811048839
%% Cell type:code id: tags:
``` python
import matplotlib.pyplot as plt
top_ats_completed = ats_completed.value_counts()[10::-1]
top_ats_failed = ats_failed.value_counts()[10::-1]
plt.figure(figsize=(10,4))
top_ats_completed.plot(kind='barh')
plt.xlabel('Frequency')
plt.ylabel('ATS Id')
plt.title('ATS frequency for citizens that completed')
plt.figure(figsize=(10,4))
top_ats_failed.plot(kind='barh')
plt.xlabel('Frequency')
plt.ylabel('ATS Id')
plt.title('ATS frequency for citizens that failed')
```
%% Output
Text(0.5, 1.0, 'ATS frequency for citizens that failed')
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.decomposition import KernelPCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
rf_pipeline = Pipeline([('scaler', StandardScaler()),
('clf', RandomForestClassifier(random_state = 0))])
lr_pipeline = Pipeline([('scaler', StandardScaler()),
('clf', LogisticRegression(random_state = 0))])
knn_pipeline = Pipeline([('scaler', StandardScaler()),
('clf', KNeighborsClassifier())])
svc_pipeline = Pipeline([('scaler', StandardScaler()),
('clf', SVC(random_state=0, gamma='scale', probability=True))])
gnb_pipeline = Pipeline([('scaler', StandardScaler()),
('clf', GaussianNB())])
X = pd.read_csv('../data/processed/X_completes.csv')
y = pd.read_csv('../data/processed/y_completes.csv')
y_arr = np.array(y).reshape(-1)
print(f"Completes/fails: {pd.Series(y_arr).value_counts().tolist()}\n")
X_train, X_test, y_train, y_test = train_test_split(
X, y_arr, test_size=0.2, stratify=y, random_state=0)
rf_pipeline.fit(X_train, y_train)
y_pred = rf_pipeline.predict(X_test)
print(f"Confusion Matrix:\n {confusion_matrix(y_test, y_pred)}\n")
print(f"Classification Report:\n {classification_report(y_test, y_pred)}\n")
```
%% Output
Completes/fails: [233, 177]

Confusion Matrix:
 [[26  9]
 [ 9 38]]

Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.74      0.74        35
           1       0.81      0.81      0.81        47

    accuracy                           0.78        82
   macro avg       0.78      0.78      0.78        82
weighted avg       0.78      0.78      0.78        82
%% Cell type:code id: tags:
``` python
cv_scores_acc = cross_val_score(rf_pipeline, X, y_arr, scoring='accuracy', cv=5)
cv_scores_ba = cross_val_score(rf_pipeline, X, y_arr, scoring='balanced_accuracy', cv=5)
cv_scores_f1 = cross_val_score(rf_pipeline, X, y_arr, scoring='f1', cv=5)
cv_scores_roc_auc = cross_val_score(rf_pipeline, X, y_arr, scoring='roc_auc', cv=5)
print("ACC CV average: %0.2f (+/- %0.2f)" % (cv_scores_acc.mean(), cv_scores_acc.std() * 2))
print("BA CV average: %0.2f (+/- %0.2f)" % (cv_scores_ba.mean(), cv_scores_ba.std() * 2))
print("F1 CV average: %0.2f (+/- %0.2f)" % (cv_scores_f1.mean(), cv_scores_f1.std() * 2))
print("ROC_AUC CV average: %0.2f (+/- %0.2f)" % (cv_scores_roc_auc.mean(), cv_scores_roc_auc.std() * 2))
```
%% Output
ACC CV average: 0.80 (+/- 0.07)
BA CV average: 0.80 (+/- 0.07)
F1 CV average: 0.83 (+/- 0.07)
ROC_AUC CV average: 0.89 (+/- 0.04)
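%% Cell type:markdown id: tags:
The four `cross_val_score` calls above each refit the pipeline five times; `cross_validate` computes all four scorers over a single set of five fits. An equivalent sketch:
%% Cell type:code id: tags:
``` python
from sklearn.model_selection import cross_validate

scoring = ['accuracy', 'balanced_accuracy', 'f1', 'roc_auc']
cv_results = cross_validate(rf_pipeline, X, y_arr, scoring=scoring, cv=5)
for name in scoring:
    scores = cv_results[f'test_{name}']
    print("%s CV average: %0.2f (+/- %0.2f)" % (name, scores.mean(), scores.std() * 2))
```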
%% Cell type:code id: tags:
``` python
from sklearn.metrics import roc_curve

def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')

def get_roc_scores(pipeline, X, y):
    # Out-of-fold probabilities for the positive class
    y_probas = cross_val_predict(pipeline, X, y, cv=5, method="predict_proba")
    fpr, tpr, _ = roc_curve(y, y_probas[:,1])
    return fpr, tpr
fpr_rf, tpr_rf = get_roc_scores(rf_pipeline, X_train, y_train)
fpr_lr, tpr_lr = get_roc_scores(lr_pipeline, X_train, y_train)
fpr_knn, tpr_knn = get_roc_scores(knn_pipeline, X_train, y_train)
fpr_svc, tpr_svc = get_roc_scores(svc_pipeline, X_train, y_train)
fpr_gnb, tpr_gnb = get_roc_scores(gnb_pipeline, X_train, y_train)
plt.figure(figsize=(8,6))
plot_roc_curve(fpr_rf, tpr_rf, "Random Forest")
plot_roc_curve(fpr_lr, tpr_lr, 'Logistic Regression')
plot_roc_curve(fpr_knn, tpr_knn, 'KNeighbors')
plot_roc_curve(fpr_svc, tpr_svc, 'SVC-RBF')
plot_roc_curve(fpr_gnb, tpr_gnb, 'Gaussian Naive-Bayes')
plt.legend(loc="lower right")
plt.title("ROC plot")
plt.ylabel("TPR")
plt.xlabel("FPR")
plt.show()
```
%% Output
%% Cell type:code id: tags:
``` python
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score
y_probas_rf = cross_val_predict(rf_pipeline, X_train, y_train, cv=5, method="predict_proba")
for threshold in [0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]:
    y_scores_new = (y_probas_rf[:,1] > threshold)
    print(f'Threshold: {threshold}, accuracy: {np.around(accuracy_score(y_train, y_scores_new), decimals=3)}, '
          f'recall: {np.around(recall_score(y_train, y_scores_new), decimals=3)}, CM:\n{confusion_matrix(y_train, y_scores_new)}\n')
```
%% Output
Threshold: 0.3, accuracy: 0.732, recall: 0.957, CM:
[[ 62  80]
 [  8 178]]

Threshold: 0.35, accuracy: 0.753, recall: 0.952, CM:
[[ 70  72]
 [  9 177]]

Threshold: 0.4, accuracy: 0.744, recall: 0.903, CM:
[[ 76  66]
 [ 18 168]]

Threshold: 0.45, accuracy: 0.777, recall: 0.871, CM:
[[ 93  49]
 [ 24 162]]

Threshold: 0.5, accuracy: 0.796, recall: 0.839, CM:
[[105  37]
 [ 30 156]]

Threshold: 0.55, accuracy: 0.796, recall: 0.801, CM:
[[112  30]
 [ 37 149]]

Threshold: 0.6, accuracy: 0.808, recall: 0.753, CM:
[[125  17]
 [ 46 140]]
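%% Cell type:markdown id: tags:
Rather than scanning a hand-picked grid, `precision_recall_curve` returns every threshold at which precision or recall changes, which makes it easy to pick, say, the highest threshold that still keeps recall above 0.9. A minimal sketch on the same cross-validated probabilities:
%% Cell type:code id: tags:
``` python
from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(y_train, y_probas_rf[:, 1])
# recall decreases as the threshold rises; recall[:-1] aligns with thresholds
mask = recall[:-1] >= 0.9
print(f'Highest threshold with recall >= 0.9: {thresholds[mask].max():.3f}')
```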
%% Cell type:code id: tags:
``` python
importances = rf_pipeline.named_steps['clf'].feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print("%d. Feature %s (%f)" % (f + 1, X_train.columns[indices[f]],
                                   importances[indices[f]]))
```
%% Output
1. Feature MeanTimeBetweenTrainingMean (0.161751)
2. Feature NumberTrainingWeekMean (0.148133)
3. Feature StdEvaluationMean (0.128963)
4. Feature MeanEvaluationMean (0.099833)
5. Feature PhysicsDifferenceMean (0.072099)
6. Feature NeedsDifferenceMean (0.055654)
7. Feature MeanNumberCancelsWeekMean (0.051806)
8. Feature NeedsStartBaseline (0.046214)
9. Feature Age (0.042812)
10. Feature NumberExercisesMean (0.040308)
11. Feature NumberAtsMean (0.036406)
12. Feature Cluster (0.035268)
13. Feature MeanTimeBetweenCancelsMean (0.019122)
14. Feature NumberCancelsSum (0.014293)
15. Feature HadEmergencySystem (0.009363)
16. Feature HadRollator (0.008866)
17. Feature HadBedWithEngine (0.008287)
18. Feature HadShowerStool (0.008100)
19. Feature Sex (0.007593)
20. Feature HadWheelchair (0.005128)
21. Feature HadRaisedToiletSeat (0.000000)
22. Feature HadSeatCushion (0.000000)
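%% Cell type:markdown id: tags:
Impurity-based importances like the ones above can favour high-cardinality features; permutation importance on the held-out split is a common cross-check. A sketch, assuming the pipeline is already fitted:
%% Cell type:code id: tags:
``` python
from sklearn.inspection import permutation_importance

# Drop in score when each feature is shuffled on the test split
result = permutation_importance(rf_pipeline, X_test, y_test, n_repeats=10, random_state=0)
for idx in result.importances_mean.argsort()[::-1][:10]:
    print(f'{X_test.columns[idx]}: {result.importances_mean[idx]:.4f}')
```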
%% Cell type:code id: tags:
``` python
plt.figure(figsize=(10,10))
pd.Series(importances[indices][::-1],
X_train.columns[indices][::-1]).plot(kind='barh')
```
%% Output
<AxesSubplot:>
%% Cell type:code id: tags:
``` python
from sklearn.metrics import log_loss
train_errors, test_errors = [], []
for m in range(1, 200):
    rf_pipeline.fit(X_train[:m], y_train[:m])
    y_train_predict = rf_pipeline.predict_proba(X_train[:m])
    y_test_predict = rf_pipeline.predict_proba(X_test)
    # Pass both class labels explicitly so log_loss works even when the
    # first m samples contain only one class
    log_loss_train = log_loss(y_train[:m], y_train_predict, eps=1e-15, labels=[0, 1])
    log_loss_test = log_loss(y_test, y_test_predict, eps=1e-15)
    train_errors.append(np.around(log_loss_train, decimals=3))
    test_errors.append(np.around(log_loss_test, decimals=3))
print(f'Mean train error: {np.mean(train_errors[-5:])}')
print(f'Mean test error: {np.mean(test_errors[-5:])}')
```
%% Output
Mean train error: 1.783
Mean test error: 0.49399999999999994
%% Cell type:code id: tags:
``` python
plt.figure(figsize=(10,6))
plt.plot(train_errors, "r-+", linewidth=2, label="train")
plt.plot(test_errors, "b-", linewidth=2, label="test")
plt.title("Learning curves for RF classifier")
plt.ylabel("Log loss")
plt.xlabel('Training set size')
plt.legend(loc="upper right")
plt.yscale('log')
plt.grid(True)
```
%% Output
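%% Cell type:markdown id: tags:
The loop above grows one fixed train split sample by sample; scikit-learn's `learning_curve` produces the same kind of plot with cross-validation at each size. A sketch using negative log loss as the score:
%% Cell type:code id: tags:
``` python
from sklearn.model_selection import learning_curve

train_sizes, train_scores, test_scores = learning_curve(
    rf_pipeline, X, y_arr, cv=5, scoring='neg_log_loss',
    train_sizes=np.linspace(0.1, 1.0, 10))
plt.figure(figsize=(10,6))
plt.plot(train_sizes, -train_scores.mean(axis=1), 'r-+', linewidth=2, label='train')
plt.plot(train_sizes, -test_scores.mean(axis=1), 'b-', linewidth=2, label='test')
plt.title('Cross-validated learning curves for RF classifier')
plt.ylabel('Log loss')
plt.xlabel('Training set size')
plt.legend(loc='upper right')
plt.grid(True)
```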
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
import datetime as dt
import src.models.feature_maker as fm
pd.reset_option('^display.', silent=True)
df = pd.read_csv('../data/interim/timeseries.csv')
df = fm.assign_number_completed(df)
```
%% Cell type:code id: tags:
``` python
df['NeedsStartBaseline'] = df.groupby('CitizenId')["NeedsStart"].transform('first')
df = df[df['NeedsStartBaseline'] != 0]
df.loc[df['NumberCompleted'] == 7].drop_duplicates(subset='CitizenId')
```
%% Output
          CitizenId  PatientId  Sex  Age  NumberScreening   StartDate  \
1562  4236583454361      40703    1   84               14  2018-11-06

         EndDate LastStatusDate  NumberWeeks  MeanEvaluation  ...  \
1562  2018-12-07     2018-12-07         4.43             4.2  ...

     PhysicsStartReason  PhysicsEnd  PhysicsDifference PhysicsReason  \
1562              Ingen        48.0               20.0         Ingen

      PhysicsIndicator  RehabIndicator  \
1562                 1            2.07

                                              Exercises  NumberExercises  \
1562  ['303451', '303453', '303452', '303454', '3034...                8

               LastStatus  NumberCompleted
1562  Significantprogress              7.0

[1 rows x 54 columns]
%% Cell type:code id: tags:
``` python
print(df.loc[df['CitizenId'] == 4236583454361])
print('Number of citizens: ', len(df.drop_duplicates(subset='CitizenId')), '\n')
for i in range(0, 8):
    print(f'Completed {i} times: ', len(df.loc[df['NumberCompleted'] == i].drop_duplicates(subset='CitizenId')))
```
%% Output
          CitizenId  PatientId  Sex  Age  NumberScreening   StartDate  \
1549  4236583454361      40703    1   82                1  2016-11-23
1550  4236583454361      40703    1   82                2  2016-11-23
1551  4236583454361      40703    1   83                3  2017-01-04
1552  4236583454361      40703    1   83                4  2017-09-28
1553  4236583454361      40703    1   83                5  2017-10-23
1554  4236583454361      40703    1   83                6  2017-12-14
1555  4236583454361      40703    1   84                7  2018-02-01
1556  4236583454361      40703    1   84                8  2018-03-05
1557  4236583454361      40703    1   84                9  2018-04-09
1558  4236583454361      40703    1   84               10  2018-05-31
1559  4236583454361      40703    1   84               11  2018-07-02
1560  4236583454361      40703    1   84               12  2018-08-06
1561  4236583454361      40703    1   84               13  2018-10-01
1562  4236583454361      40703    1   84               14  2018-11-06

         EndDate LastStatusDate  NumberWeeks  MeanEvaluation  ...  \
1549  2016-11-23     2020-08-14         0.00             2.0  ...
1550  2017-01-04     2020-08-14         6.00             2.4  ...
1551  2017-09-25     2017-09-25        37.71             2.0  ...
1552  2017-10-23     2017-10-23         3.57             3.6  ...
1553  2017-12-14     2017-10-23         7.43             3.6  ...
1554  2018-02-01     2018-02-01         7.00             3.3  ...
1555  2018-03-05     2018-02-01         4.57             3.1  ...
1556  2018-04-09     2020-08-14         5.00             4.0  ...
1557  2018-05-31     2018-05-29         7.43             3.5  ...
1558  2018-07-02     2020-08-14         4.57             3.9  ...
1559  2018-08-06     2018-08-06         5.00             3.6  ...
1560  2018-10-01     2018-08-06         8.00             4.0  ...
1561  2018-11-06     2020-08-14         5.14             3.7  ...
1562  2018-12-07     2018-12-07         4.43             4.2  ...

        PhysicsStartReason  PhysicsEnd  PhysicsDifference  \
1549                 Ingen        17.0                0.0
1550                 Ingen        13.0               -4.0
1551                 Ingen        16.0                3.0
1552                 Ingen        42.0               26.0
1553                 Ingen        29.0              -13.0
1554                 Andet        29.0                0.0
1555                 Ingen        27.0               -2.0
1556  Manglende motivation        27.0                0.0
1557                 Ingen        29.0                2.0
1558                 Ingen        25.0               -4.0
1559      Ingen forklaring        31.0                6.0
1560                 Ingen        25.0               -6.0
1561      Ingen forklaring        28.0                3.0
1562                 Ingen        48.0               20.0

             PhysicsReason  PhysicsIndicator  RehabIndicator  \
1549                 Ingen                 0            1.76
1550                 Ingen                 0            1.76
1551                 Ingen                 1            3.08
1552                 Ingen                 1            4.31
1553                 Andet                 0            1.33
1554                 Ingen                 0            1.90
1555  Manglende motivation                 0            1.28
1556                 Ingen                 0            1.74
1557                 Ingen                 1            1.30
1558      Ingen forklaring                 0            1.03
1559                 Ingen                 1            3.52
1560      Ingen forklaring                 0            1.16
1561                 Ingen                 1            2.00
1562                 Ingen                 1            2.07

                                              Exercises  NumberExercises  \
1549           ['257883', '257884', '257885', '257886']                4
1550           ['257883', '257884', '257885', '257886']                4
1551           ['260933', '260934', '260935', '260936']                4
1552  ['274332', '274333', '274336', '274334', '2743...                7
1553  ['275394', '275396', '275398', '275395', '2753...                8
1554  ['278311', '278312', '278313', '278314', '2783...                7
1555  ['281129', '281130', '281131', '281132', '2811...                7
1556  ['283143', '283144', '283145', '283146', '2831...                7
1557  ['285287', '285288', '285289', '285290', '2852...                7
1558  ['289757', '289758', '289759', '289760', '2897...                7
1559  ['291949', '291950', '291951', '291952', '2919...                9
1560  ['293890', '293891', '293893', '293896', '2938...                7
1561  ['299590', '299591', '299592', '299593', '2995...                9
1562  ['303451', '303453', '303452', '303454', '3034...                8

               LastStatus  NumberCompleted
1549                 None              0.0
1550                 None              1.0
1551               Active              1.0
1552  SignificantProgress              2.0
1553  SignificantProgress              2.0
1554  SignificantProgress              3.0
1555  SignificantProgress              3.0
1556                 None              4.0
1557          ReActivated              4.0
1558                 None              5.0
1559  SignificantProgress              5.0
1560  SignificantProgress              6.0
1561                 None              6.0
1562  Significantprogress              7.0

[14 rows x 54 columns]
Number of citizens:  420 

Completed 0 times:  420
Completed 1 times:  239
Completed 2 times:  99
Completed 3 times:  57
Completed 4 times:  28
Completed 5 times:  11
Completed 6 times:  3
Completed 7 times:  1
......
@@ -17,18 +17,17 @@ def clean_devices(hu):
    :param hu: The assistive device table as a pd.DataFrame
    """
    # Sort the dataframe by lend date
    hu = hu.reset_index().sort_values(by=[cfg.CITIZEN_ID, cfg.LEND_DATE]).drop(['index'], axis=1)

    # Columns used to filter for duplicates
    cols = [cfg.CITIZEN_ID, cfg.DEV_ISO_CLASS]

    # Mark all duplicates of CitizenId and DevISOClass, keeping the first and the last
    mask_first = ~hu.duplicated(subset=cols, keep='first')
    mask_last = ~hu.duplicated(subset=cols, keep='last')

    # The LendDate for the first delivered device for every citizen
    hu_first = hu[mask_first].loc[:, cols + [cfg.LEND_DATE]]

    # The ReturnDate for the last delivered device for every citizen
......
@@ -124,7 +124,7 @@ def init_clusters():
    # Find the number of times a device appears in a list of devices
    device_sum = lod.sum(axis=0)

    # Identify the devices that appear less often than the threshold
    # (15% quantile, roughly 4 times) and keep only the rest
    thr = device_sum.quantile(0.15)
    device_sum_thr = device_sum >= thr
    iks_freq = device_sum_thr.index.where(device_sum_thr).dropna().values
......
@@ -24,10 +24,13 @@ EXTERNAL_DATA_DIR = Path.joinpath(ROOT_DIR, 'data/external')
GENERAL_FEATURES = ['Sex', 'Age', 'Cluster']
DEVICE_FEATURES = ['HasRollator',
                   'HasShowerStool',
                   'HasRaisedToiletSeat',
                   'HasEmergencySystem',
                   'HasSeatCushion',
                   'HasWheelchair',
                   'HasBedWithEngine',
                   'ATS']
COMPLETES_FEATURES = ['NumberWeeksSum', 'NumberTrainingSum']
......