Commit d5c5da59 authored by Christian Marius Lillelund
Browse files

improved api, ats names, added falllong feature

parent 844d0d6f
Pipeline #47799 failed with stage
in 3 minutes and 28 seconds
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
%% Cell type:code id: tags:
 
``` python
import pandas as pd
import numpy as np
import datetime as dt
import tools.feature_maker as fm
import tools.preprocessor as preprocessor
import matplotlib.pyplot as plt
import config as cfg
from pathlib import Path
 
# Put every pandas display option back to its default before showing data.
pd.reset_option('^display.', silent=True)

# CitizenId is parsed with str so the identifiers are kept as text.
df = pd.read_csv(
    '../data/interim/screenings.csv',
    converters={'CitizenId': str},
)

print(f"Number of screenings: {len(df)}")
print(f"Number of citizens: {df.CitizenId.nunique()}")

# Apply the feature-engineering steps one after another, in this order.
for transform in (
    fm.make_complete_feature,
    fm.make_citizen_training,
    fm.make_citizen_ats,
    preprocessor.replace_ats_strings,
):
    df = transform(df)

# Split the rows by outcome flag: 1 = completed, 0 = failed.
df_comp = df[df['Complete'] == 1]
df_fail = df[df['Complete'] == 0]

# NOTE(review): both counts are row counts of the processed frame, not
# distinct CitizenId counts -- confirm the wording of the labels.
print(f"Number of citizens that completed: {len(df_comp)}")

print(f"Number of citizens that failed: {len(df_fail)}")
```
 
%% Output
 
Number of screenings: 3217
Number of citizens: 885
Number of citizens that completed: 1543
Number of citizens that failed: 601
 
%% Cell type:code id: tags:
 
``` python
# Display every row whose Complete flag equals 0.
df.loc[df['Complete'] == 0]
```
 
%% Output
 
index CitizenId Gender BirthYear NumberSplit NumberScreening \\n0 0 3810622973 0 31 0 0 \n1 1 5806703169 0 35 0 0 \n5 8 4420982563 1 49 0 0 \n8 13 3806883741 0 44 0 0 \n15 23 3610642969 0 32 2 0 \n... ... ... ... ... ... ... \n2122 3180 3010883085 0 44 0 0 \n2130 3193 3003042889 0 52 0 0 \n2141 3211 5403004571 1 50 0 0 \n2142 3212 4212803493 0 40 0 0 \n2143 3214 4208665171 1 33 0 0 \n\n StartDate EndDate NumberWeeks MeanEvaluation ... \\n0 01-06-2016 01-06-2016 14.43 0.0 ... \n1 25-06-2020 25-06-2020 2.00 4.0 ... \n5 31-08-2020 31-08-2020 0.00 3.0 ... \n8 10-09-2020 10-09-2020 0.00 6.0 ... \n15 28-06-2018 28-06-2018 0.00 5.0 ... \n... ... ... ... ... ... \n2122 16-04-2019 16-04-2019 0.57 2.0 ... \n2130 19-03-2018 19-03-2018 0.86 0.0 ... \n2141 10-10-2016 10-10-2016 0.00 0.0 ... \n2142 07-08-2020 07-08-2020 0.00 0.0 ... \n2143 06-04-2016 06-04-2016 0.00 0.0 ... \n\n StdEvaluationMean NumberTrainingWeekMean MeanTimeBetweenTrainingMean \\n0 0.0 0.0 0.0 \n1 0.0 0.0 0.0 \n5 0.0 0.0 0.0 \n8 0.0 0.0 0.0 \n15 0.0 0.0 0.0 \n... ... ... ... \n2122 0.0 0.0 0.0 \n2130 0.0 0.0 0.0 \n2141 0.0 0.0 0.0 \n2142 0.0 0.0 0.0 \n2143 0.0 0.0 0.0 \n\n NumberCancelsSum MeanTimeBetweenCancelsMean MeanNumberCancelsWeekMean \\n0 0 0.0 0.0 \n1 0 0.0 0.0 \n5 0 0.0 0.0 \n8 0 0.0 0.0 \n15 0 0.0 0.0 \n... ... ... ... \n2122 0 0.0 0.0 \n2130 0 0.0 0.0 \n2141 0 0.0 0.0 \n2142 0 0.0 0.0 \n2143 0 0.0 0.0 \n\n NeedsMean PhysicsMean NumberExercisesMean NumberAtsMean \n0 29.0 13.0 4.0 12.00 \n1 19.0 26.0 5.0 9.00 \n5 47.0 27.0 3.0 18.00 \n8 12.0 41.0 9.0 28.00 \n15 7.0 64.0 8.0 6.67 \n... ... ... ... ... \n2122 41.0 38.0 6.0 5.00 \n2130 13.0 35.0 7.0 15.00 \n2141 19.0 20.0 5.0 24.00 \n2142 21.0 45.0 7.0 19.00 \n2143 29.0 70.0 7.0 4.00 \n\n[601 rows x 56 columns]
 
%% Cell type:code id: tags:
 
``` python
# Preview the first rows of the Complete == 1 subset.
df_comp.head()
```
 
%% Output
 
index CitizenId Gender BirthYear NumberSplit NumberScreening \\n2 2 6216663229 0 33 0 0 \n3 5 6216663229 0 33 0 3 \n4 7 1424924457 0 46 0 0 \n6 10 2824621797 0 31 0 0 \n7 12 2824621797 0 31 0 2 \n\n StartDate EndDate NumberWeeks MeanEvaluation ... \\n2 14-02-2019 14-02-2019 0.00 2.0 ... \n3 14-03-2019 15-04-2019 4.57 3.3 ... \n4 20-02-2018 20-02-2018 9.71 0.0 ... \n6 11-05-2020 11-05-2020 0.00 0.0 ... \n7 16-06-2020 04-08-2020 10.00 5.6 ... \n\n StdEvaluationMean NumberTrainingWeekMean MeanTimeBetweenTrainingMean \\n2 0.0 0.0 0.00 \n3 0.8 2.2 3.56 \n4 0.0 0.0 0.00 \n6 0.0 0.0 0.00 \n7 0.8 1.0 8.17 \n\n NumberCancelsSum MeanTimeBetweenCancelsMean MeanNumberCancelsWeekMean \\n2 0 0.00 0.00 \n3 0 0.00 0.00 \n4 0 0.00 0.00 \n6 0 0.00 0.00 \n7 5 11.75 0.71 \n\n NeedsMean PhysicsMean NumberExercisesMean NumberAtsMean \n2 0.0 0.0 0.0 8.0 \n3 7.0 26.0 3.0 8.0 \n4 41.0 24.0 6.0 40.0 \n6 10.0 56.0 8.0 0.0 \n7 0.0 86.0 9.0 0.0 \n\n[5 rows x 56 columns]
 
%% Cell type:code id: tags:
 
``` python
# Preview the first rows of the Complete == 0 subset.
df_fail.head()
```
 
%% Output
 
index CitizenId Gender BirthYear NumberSplit NumberScreening \\n0 0 3810622973 0 31 0 0 \n1 1 5806703169 0 35 0 0 \n5 8 4420982563 1 49 0 0 \n8 13 3806883741 0 44 0 0 \n15 23 3610642969 0 32 2 0 \n\n StartDate EndDate NumberWeeks MeanEvaluation ... \\n0 01-06-2016 01-06-2016 14.43 0.0 ... \n1 25-06-2020 25-06-2020 2.00 4.0 ... \n5 31-08-2020 31-08-2020 0.00 3.0 ... \n8 10-09-2020 10-09-2020 0.00 6.0 ... \n15 28-06-2018 28-06-2018 0.00 5.0 ... \n\n StdEvaluationMean NumberTrainingWeekMean MeanTimeBetweenTrainingMean \\n0 0.0 0.0 0.0 \n1 0.0 0.0 0.0 \n5 0.0 0.0 0.0 \n8 0.0 0.0 0.0 \n15 0.0 0.0 0.0 \n\n NumberCancelsSum MeanTimeBetweenCancelsMean MeanNumberCancelsWeekMean \\n0 0 0.0 0.0 \n1 0 0.0 0.0 \n5 0 0.0 0.0 \n8 0 0.0 0.0 \n15 0 0.0 0.0 \n\n NeedsMean PhysicsMean NumberExercisesMean NumberAtsMean \n0 29.0 13.0 4.0 12.00 \n1 19.0 26.0 5.0 9.00 \n5 47.0 27.0 3.0 18.00 \n8 12.0 41.0 9.0 28.00 \n15 7.0 64.0 8.0 6.67 \n\n[5 rows x 56 columns]
 
%% Cell type:code id: tags:
 
``` python
# Inspect the NumberExercisesMean column of the Complete == 1 subset.
df_comp.NumberExercisesMean
```
 
%% Output
 
2 0.0\n3 3.0\n4 6.0\n6 8.0\n7 9.0\n ... \n2136 8.0\n2137 8.0\n2138 8.0\n2139 8.0\n2140 8.0\nName: NumberExercisesMean, Length: 1543, dtype: float64
 
%% Cell type:code id: tags:
 
``` python
import seaborn as sns
def bar_plot(df, variable):
    """Draw and save a bar chart of the value frequencies of one column.

    Args:
        df: Data frame that contains the column.
        variable: Name of the column whose distinct values are counted.

    The chart is saved in cfg.REPORTS_PLOTS_DIR under the name
    "Time series bar <variable>.pdf".
    """
    # Occurrences of each distinct value in the column.
    counts = df[variable].value_counts()
    categories = counts.index

    plt.figure()
    plt.bar(categories, counts)
    # One tick per distinct value, labelled with the value itself.
    plt.xticks(categories, categories.values)
    plt.ylabel("Frequency")
    plt.title(variable)

    target = Path.joinpath(
        cfg.REPORTS_PLOTS_DIR, f"Time series bar {variable}.pdf"
    )
    plt.savefig(target, dpi=300, bbox_inches="tight")
 
def hist_plot(df, variable, bins):
    """Draw and save a histogram of one column.

    Args:
        df: Data frame that contains the column.
        variable: Name of the column to plot.
        bins: Bin specification, passed straight to plt.hist.

    The chart is saved in cfg.REPORTS_PLOTS_DIR under the name
    "Time series histogram <variable>.pdf".
    """
    # The histogram only needs the raw column; no value counts are required.
    plt.figure()
    plt.hist(df[variable], bins)
    plt.xlabel(variable)
    plt.ylabel("Frequency")
    plt.title(f"{variable} distribution")

    file_name = f"Time series histogram {variable}.pdf"
    plt.savefig(
        Path.joinpath(cfg.REPORTS_PLOTS_DIR, file_name),
        dpi=300,
        bbox_inches="tight",
    )
 
# Stack both outcome groups into one frame again (completed rows first).
df = pd.concat([df_comp, df_fail])

# Bar charts for the columns listed in category1.
category1 = ["Complete", "Gender", "NumberScreening"]
for c in category1:
    bar_plot(df, c)

# Histograms, all drawn with 50 bins.
for column in (
    "NumberAtsMean",
    "NumberExercisesMean",
    "NeedsMean",
    "PhysicsMean",
):
    hist_plot(df, column, bins=50)
```
 
%% Output
 
 
 
 
 
 
 
 
%% Cell type:code id: tags:
 
``` python
import seaborn as sns
# Columns whose pairwise correlations are shown in the heatmap.
list1 = ["Gender", "BirthYear", "NeedsMean", "PhysicsMean", "Complete"]
correlations = df[list1].corr()

fig, ax = plt.subplots(figsize=(10, 4))
ax.set_title('Feature correlation with Complete')
# Draw on the axes created above and print each coefficient with two decimals.
sns.heatmap(correlations, annot=True, linewidths=.5, fmt=".2f", ax=ax)

file_name = "Time series heatmap.pdf"
plt.savefig(
    Path.joinpath(cfg.REPORTS_PLOTS_DIR, file_name),
    dpi=300,
    bbox_inches="tight",
)
```
 
%% Output
 
 
%% Cell type:code id: tags:
 
``` python
# Bar chart of Complete per Gender value.
# sns.factorplot and its `size` argument were deprecated in seaborn 0.9 and
# later removed; sns.catplot with `height` is the replacement. kind="bar" was
# already explicit, so the resulting plot is the same.
g = sns.catplot(x="Gender", y="Complete", data=df, kind="bar", height=4)
g.set_ylabels("Complete Probability")
g.fig.suptitle('Complete probability given Gender')
file_name = "Time series factorplot Gender Complete.pdf"
plt.savefig(
    Path.joinpath(cfg.REPORTS_PLOTS_DIR, file_name),
    dpi=300,
    bbox_inches="tight",
)
```
 
%% Output
 
 
%% Cell type:code id: tags:
 
``` python
# Bar chart of Complete per NumberExercises value, restricted to rows with
# at least one exercise.
# sns.factorplot and its `size` argument were deprecated in seaborn 0.9 and
# later removed; sns.catplot with `height` is the replacement. kind="bar" was
# already explicit, so the resulting plot is the same.
g = sns.catplot(
    x="NumberExercises",
    y="Complete",
    data=df.loc[df.NumberExercises > 0],
    kind="bar",
    height=5,
)
g.set_ylabels("Complete Probability")
g.fig.suptitle('Complete probability given NumberExercises')
file_name = "Time series factorplot NumberExercises Complete.pdf"
plt.savefig(
    Path.joinpath(cfg.REPORTS_PLOTS_DIR, file_name),
    dpi=300,
    bbox_inches="tight",
)
```
 
%% Output
 
 
%% Cell type:code id: tags:
 
``` python
# Bar chart of Complete per NumberCancels value.
# sns.factorplot and its `size` argument were deprecated in seaborn 0.9 and
# later removed; sns.catplot with `height` is the replacement. kind="bar" was
# already explicit, so the resulting plot is the same.
g = sns.catplot(x="NumberCancels", y="Complete", data=df, kind="bar", height=5)
g.set_ylabels("Complete Probability")
g.fig.suptitle('Complete probability given NumberCancels')
file_name = "Time series factorplot NumberCancels Complete.pdf"
plt.savefig(
    Path.joinpath(cfg.REPORTS_PLOTS_DIR, file_name),
    dpi=300,
    bbox_inches="tight",
)
```
 
%% Output
 
 
%% Cell type:code id: tags:
 
``` python
# Summary statistics of len(Exercises) / 9, first for the completed group and
# then for the failed group.
# NOTE(review): the divisor 9 is not explained in this notebook -- confirm
# what one unit of length represents.
def _scaled_exercise_length(exercises):
    return len(exercises) / 9


completed_stats = df_comp.Exercises.apply(_scaled_exercise_length).describe()
print(completed_stats, '\n')
failed_stats = df_fail.Exercises.apply(_scaled_exercise_length).describe()
print(failed_stats)
```
 
%% Output
 
count 1543.000000
mean 3.563693
std 1.198437
min 0.111111
25% 2.666667
50% 3.777778
75% 4.333333
max 4.888889
Name: Exercises, dtype: float64
count 601.000000
mean 3.363838
std 1.377327
min 0.111111
25% 2.666667
50% 3.777778
75% 4.333333
max 4.888889
Name: Exercises, dtype: float64
 
%% Cell type:code id: tags:
 
``` python
def get_ats_list(df):
    """Flatten the comma-separated ``Ats`` column into one list of codes.

    Args:
        df: Data frame with an ``Ats`` column holding comma-separated
            strings. Missing entries (None/NaN) are skipped.

    Returns:
        A list with every code from every row, in row order. Codes are
        returned exactly as split on "," (no trimming), and duplicates
        are kept.
    """
    return [
        ats
        for ats_string in df.Ats.dropna()
        for ats in ats_string.split(",")
    ]
 
# Drop the rows that have no Ats value before splitting the strings.
df_comp = df_comp.loc[df_comp['Ats'].notna()]
df_fail = df_fail.loc[df_fail['Ats'].notna()]

# One Series entry per individual ATS code occurrence.
ats_completed = pd.Series(get_ats_list(df_comp))
ats_failed = pd.Series(get_ats_list(df_fail))

# Ten most frequent ATS codes, completed group first, then failed group.
completed_counts = ats_completed.value_counts()
print(completed_counts.head(10), "\n")
failed_counts = ats_failed.value_counts()
print(failed_counts.head(10))
```
 
%% Output
 
120606 2541
093307 1574
222718 1378
043303 1147
091218 1069
122203 1026
043306 736
091203 704
181210 596
123103 561
dtype: int64
120606 973
093307 635
222718 616
043303 498
122203 442
091218 394
043306 314
091203 287
181210 259
123103 242
dtype: int64
 
%% Cell type:code id: tags:
 
``` python
# Fraction of all ATS occurrences that do NOT belong to the single most
# frequent code: sum of every count except the first, divided by the total.
completed_counts = ats_completed.value_counts()
print(completed_counts.iloc[1:].sum() / len(ats_completed))
failed_counts = ats_failed.value_counts()
print(failed_counts.iloc[1:].sum() / len(ats_failed))
```
 
%% Output
 
0.845719489981785
0.8596162169961045
 
%% Cell type:code id: tags:
 
``` python
# Positions 10 down to 0 of the descending counts, i.e. the most frequent
# codes in ascending order, so the most frequent code is the last bar drawn.
# NOTE(review): this selects 11 entries, not 10 -- confirm that is intended.
top_ats_completed = ats_completed.value_counts().iloc[10::-1]
top_ats_failed = ats_failed.value_counts().iloc[10::-1]

# Same horizontal bar chart for both groups; only data, title and file differ.
for top_ats, title, file_name in (
    (
        top_ats_completed,
        'ATS frequency for citizens that complete',
        "Time series ATS frequency complete.pdf",
    ),
    (
        top_ats_failed,
        'ATS frequency for citizens that do not complete',
        "Time series ATS frequency fail.pdf",
    ),
):
    plt.figure(figsize=(10, 4))
    top_ats.plot(kind='barh')
    plt.xlabel('Frequency')
    plt.ylabel('Ats id')
    plt.title(title)
    plt.savefig(
        Path.joinpath(cfg.REPORTS_PLOTS_DIR, file_name),
        dpi=300,
        bbox_inches="tight",
    )
```
 
%% Output
 
 
 
%% Cell type:code id: tags:
 
``` python
# Per-citizen total of MeanTimeBetweenTrainingMean for each outcome group.
# NOTE(review): this sums a column of means per citizen, whereas the
# NumberTrainingWeekMean plot uses .mean() -- confirm that sum is intended.
grp_completed = df_comp.groupby(['CitizenId'])['MeanTimeBetweenTrainingMean'].sum()
grp_failed = df_fail.groupby(['CitizenId'])['MeanTimeBetweenTrainingMean'].sum()
plt.figure(figsize=(6,6))
# The x coordinate is the position within each grouped result, not the
# CitizenId itself; completed citizens are blue, failed citizens are red.
plt.scatter(range(len(grp_completed)), grp_completed.values, color='b')
plt.scatter(range(len(grp_failed)), grp_failed.values, color='r')
plt.title('Citizens and MeanTimeBetweenTrainingMean')
plt.xlabel('Citizen id')
plt.ylabel('MeanTimeBetweenTrainingMean')
# Symmetric log scale for the y axis.
plt.yscale('symlog')
```
 
%% Output
 
 
%% Cell type:code id: tags:
 
``` python
# Per-citizen average of NumberTrainingWeekMean for each outcome group.
grp_completed = df_comp.groupby(['CitizenId'])['NumberTrainingWeekMean'].mean()
grp_failed = df_fail.groupby(['CitizenId'])['NumberTrainingWeekMean'].mean()

plt.figure(figsize=(6, 6))
# The x coordinate is the position within each grouped result, not the
# CitizenId itself; completed citizens are blue, failed citizens are red.
for grouped, colour in ((grp_completed, 'b'), (grp_failed, 'r')):
    positions = range(len(grouped))
    plt.scatter(positions, grouped.values, color=colour)

plt.yscale('symlog')
plt.xlabel('Citizen id')
plt.ylabel('NumberTrainingWeekMean')
plt.title('Citizens and NumberTrainingWeekMean')
```
 
%% Output