Commit a6d1e873 authored by Christian Marius Lillelund
Browse files

put ats and ex in a csv file

parent 3326a6e1
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
# -- Project information -----------------------------------------------------
# Names shown in the rendered documentation.
project = "air"
copyright = "2021, Christian Marius Lillelund"
author = "Christian Marius Lillelund"

# Full release string (alpha/beta/rc tags included when present).
release = "0.0.1"

# -- General configuration ---------------------------------------------------

# Sphinx extension module names, built-in ('sphinx.ext.*') or custom.
# None are enabled here.
extensions = []

# Template directories, relative to this directory.
templates_path = ["_templates"]

# Patterns (relative to the source directory) for files and directories that
# are ignored when looking for sources. They also apply to html_static_path
# and html_extra_path.
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]

# -- Options for HTML output -------------------------------------------------

# Theme for HTML and HTML Help pages.
html_theme = "alabaster"

# Directories with custom static files (e.g. style sheets), relative to this
# directory. They are copied after the builtin static files, so a file named
# "default.css" overwrites the builtin "default.css".
html_static_path = ["_static"]
\ No newline at end of file
......@@ -17,7 +17,7 @@ import sys
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
# sys.path.insert(0, os.path.abspath('.'))
sys.path.insert(0, os.path.abspath('..'))
# -- General configuration -----------------------------------------------------
......@@ -26,7 +26,11 @@ import sys
# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = []
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.napoleon',
'sphinx.ext.viewcode',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
......@@ -120,7 +124,7 @@ html_theme = 'default'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_static_path = []
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
......
.. air documentation master file, created by
sphinx-quickstart.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
air
=======================================================
air documentation!
==============================================
Contents:
.. automodule:: tools.preprocessor
:members:
.. toctree::
:maxdepth: 2
getting-started
commands
:caption: Contents:
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
* :ref:`search`
\ No newline at end of file
.. air documentation master file, created by
sphinx-quickstart on Wed Apr 28 11:13:45 2021.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Welcome to air's documentation!
===============================
.. toctree::
:maxdepth: 2
:caption: Contents:
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
@ECHO OFF
REM Work from the directory that contains this script; popd at the end undoes it.
pushd %~dp0
REM Command file for Sphinx documentation
REM Use the sphinx-build found on PATH unless SPHINXBUILD is already set.
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build
REM Without a target argument, jump to the help label.
if "%1" == "" goto help
REM Probe run with all output discarded; only the resulting errorlevel is used.
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
REM Pass the requested target straight to sphinx-build via its -M option.
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd
%% Cell type:code id: tags:
 
``` python
import pandas as pd
import numpy as np
import datetime as dt
import tools.feature_maker as fm
import tools.preprocessor as preprocessor
import matplotlib.pyplot as plt
import config as cfg
from pathlib import Path
 
# Reset pandas display options, then load the interim screenings table.
# CitizenId is converted with str so the ids stay textual.
pd.reset_option('^display.', silent=True)
df = pd.read_csv('../data/interim/screenings.csv', converters={'CitizenId': str})

print(f"Number of screenings: {len(df)}")
print(f"Number of citizens: {df['CitizenId'].nunique()}")

# Apply the feature-engineering steps one after another, in this order.
feature_steps = (
    fm.make_complete_feature,
    fm.make_citizen_training,
    fm.make_citizen_ats,
    preprocessor.replace_ats_strings,
)
for step in feature_steps:
    df = step(df)

# Split on the Complete flag.
# NOTE(review): len() counts rows of df, which need not equal the number of
# distinct citizens that the printed messages refer to - confirm the wording.
completed_mask = df['Complete'] == 1
df_comp = df.loc[completed_mask]
print(f"Number of citizens that completed: {len(df_comp)}")

failed_mask = df['Complete'] == 0
df_fail = df.loc[failed_mask]
print(f"Number of citizens that failed: {len(df_fail)}")
```
 
%% Output
 
Number of screenings: 3217
Number of citizens: 885
Number of citizens that completed: 1543
Number of citizens that failed: 601
 
%% Cell type:code id: tags:
 
``` python
# Display every row whose Complete flag is 0.
df.loc[df['Complete'] == 0]
```
 
%% Output
 
index CitizenId Gender BirthYear NumberSplit NumberScreening \\n0 0 3810622973 0 31 0 0 \n1 1 5806703169 0 35 0 0 \n5 8 4420982563 1 49 0 0 \n8 13 3806883741 0 44 0 0 \n15 23 3610642969 0 32 2 0 \n... ... ... ... ... ... ... \n2122 3180 3010883085 0 44 0 0 \n2130 3193 3003042889 0 52 0 0 \n2141 3211 5403004571 1 50 0 0 \n2142 3212 4212803493 0 40 0 0 \n2143 3214 4208665171 1 33 0 0 \n\n StartDate EndDate NumberWeeks MeanEvaluation ... \\n0 01-06-2016 01-06-2016 14.43 0.0 ... \n1 25-06-2020 25-06-2020 2.00 4.0 ... \n5 31-08-2020 31-08-2020 0.00 3.0 ... \n8 10-09-2020 10-09-2020 0.00 6.0 ... \n15 28-06-2018 28-06-2018 0.00 5.0 ... \n... ... ... ... ... ... \n2122 16-04-2019 16-04-2019 0.57 2.0 ... \n2130 19-03-2018 19-03-2018 0.86 0.0 ... \n2141 10-10-2016 10-10-2016 0.00 0.0 ... \n2142 07-08-2020 07-08-2020 0.00 0.0 ... \n2143 06-04-2016 06-04-2016 0.00 0.0 ... \n\n StdEvaluationMean NumberTrainingWeekMean MeanTimeBetweenTrainingMean \\n0 0.0 0.0 0.0 \n1 0.0 0.0 0.0 \n5 0.0 0.0 0.0 \n8 0.0 0.0 0.0 \n15 0.0 0.0 0.0 \n... ... ... ... \n2122 0.0 0.0 0.0 \n2130 0.0 0.0 0.0 \n2141 0.0 0.0 0.0 \n2142 0.0 0.0 0.0 \n2143 0.0 0.0 0.0 \n\n NumberCancelsSum MeanTimeBetweenCancelsMean MeanNumberCancelsWeekMean \\n0 0 0.0 0.0 \n1 0 0.0 0.0 \n5 0 0.0 0.0 \n8 0 0.0 0.0 \n15 0 0.0 0.0 \n... ... ... ... \n2122 0 0.0 0.0 \n2130 0 0.0 0.0 \n2141 0 0.0 0.0 \n2142 0 0.0 0.0 \n2143 0 0.0 0.0 \n\n NeedsMean PhysicsMean NumberExercisesMean NumberAtsMean \n0 29.0 13.0 4.0 12.00 \n1 19.0 26.0 5.0 9.00 \n5 47.0 27.0 3.0 18.00 \n8 12.0 41.0 9.0 28.00 \n15 7.0 64.0 8.0 6.67 \n... ... ... ... ... \n2122 41.0 38.0 6.0 5.00 \n2130 13.0 35.0 7.0 15.00 \n2141 19.0 20.0 5.0 24.00 \n2142 21.0 45.0 7.0 19.00 \n2143 29.0 70.0 7.0 4.00 \n\n[601 rows x 56 columns]
 
%% Cell type:code id: tags:
 
``` python
# Preview the first rows of the completed subset.
df_comp.head()
```
 
%% Output
 
index CitizenId Gender BirthYear NumberSplit NumberScreening \\n2 2 6216663229 0 33 0 0 \n3 5 6216663229 0 33 0 3 \n4 7 1424924457 0 46 0 0 \n6 10 2824621797 0 31 0 0 \n7 12 2824621797 0 31 0 2 \n\n StartDate EndDate NumberWeeks MeanEvaluation ... \\n2 14-02-2019 14-02-2019 0.00 2.0 ... \n3 14-03-2019 15-04-2019 4.57 3.3 ... \n4 20-02-2018 20-02-2018 9.71 0.0 ... \n6 11-05-2020 11-05-2020 0.00 0.0 ... \n7 16-06-2020 04-08-2020 10.00 5.6 ... \n\n StdEvaluationMean NumberTrainingWeekMean MeanTimeBetweenTrainingMean \\n2 0.0 0.0 0.00 \n3 0.8 2.2 3.56 \n4 0.0 0.0 0.00 \n6 0.0 0.0 0.00 \n7 0.8 1.0 8.17 \n\n NumberCancelsSum MeanTimeBetweenCancelsMean MeanNumberCancelsWeekMean \\n2 0 0.00 0.00 \n3 0 0.00 0.00 \n4 0 0.00 0.00 \n6 0 0.00 0.00 \n7 5 11.75 0.71 \n\n NeedsMean PhysicsMean NumberExercisesMean NumberAtsMean \n2 0.0 0.0 0.0 8.0 \n3 7.0 26.0 3.0 8.0 \n4 41.0 24.0 6.0 40.0 \n6 10.0 56.0 8.0 0.0 \n7 0.0 86.0 9.0 0.0 \n\n[5 rows x 56 columns]
 
%% Cell type:code id: tags:
 
``` python
# Preview the first rows of the failed subset.
df_fail.head()
```
 
%% Output
 
index CitizenId Gender BirthYear NumberSplit NumberScreening \\n0 0 3810622973 0 31 0 0 \n1 1 5806703169 0 35 0 0 \n5 8 4420982563 1 49 0 0 \n8 13 3806883741 0 44 0 0 \n15 23 3610642969 0 32 2 0 \n\n StartDate EndDate NumberWeeks MeanEvaluation ... \\n0 01-06-2016 01-06-2016 14.43 0.0 ... \n1 25-06-2020 25-06-2020 2.00 4.0 ... \n5 31-08-2020 31-08-2020 0.00 3.0 ... \n8 10-09-2020 10-09-2020 0.00 6.0 ... \n15 28-06-2018 28-06-2018 0.00 5.0 ... \n\n StdEvaluationMean NumberTrainingWeekMean MeanTimeBetweenTrainingMean \\n0 0.0 0.0 0.0 \n1 0.0 0.0 0.0 \n5 0.0 0.0 0.0 \n8 0.0 0.0 0.0 \n15 0.0 0.0 0.0 \n\n NumberCancelsSum MeanTimeBetweenCancelsMean MeanNumberCancelsWeekMean \\n0 0 0.0 0.0 \n1 0 0.0 0.0 \n5 0 0.0 0.0 \n8 0 0.0 0.0 \n15 0 0.0 0.0 \n\n NeedsMean PhysicsMean NumberExercisesMean NumberAtsMean \n0 29.0 13.0 4.0 12.00 \n1 19.0 26.0 5.0 9.00 \n5 47.0 27.0 3.0 18.00 \n8 12.0 41.0 9.0 28.00 \n15 7.0 64.0 8.0 6.67 \n\n[5 rows x 56 columns]
 
%% Cell type:code id: tags:
 
``` python
# Display the NumberExercisesMean column of the completed subset.
df_comp.NumberExercisesMean
```
 
%% Output
 
2 0.0\n3 3.0\n4 6.0\n6 8.0\n7 9.0\n ... \n2136 8.0\n2137 8.0\n2138 8.0\n2139 8.0\n2140 8.0\nName: NumberExercisesMean, Length: 1543, dtype: float64
 
%% Cell type:code id: tags:
 
``` python
import seaborn as sns
def bar_plot(df, variable):
    """Draw a bar chart of value frequencies for one column and save it.

    Args:
        df: Table indexable by column name (``df[variable]``).
        variable: Column whose distinct values are counted; also used as the
            plot title and inside the output file name.

    The chart is saved under ``cfg.REPORTS_PLOTS_DIR`` as
    ``Time series bar <variable>.pdf``.
    """
    # Occurrences of each distinct value in the column.
    counts = df[variable].value_counts()
    categories = counts.index

    plt.figure()
    plt.bar(categories, counts)
    # One tick per distinct value, labelled with the value itself.
    plt.xticks(categories, categories.values)
    plt.ylabel("Frequency")
    plt.title(variable)

    target = Path.joinpath(cfg.REPORTS_PLOTS_DIR, f"Time series bar {variable}.pdf")
    plt.savefig(target, dpi=300, bbox_inches="tight")
 
def hist_plot(df, variable, bins):
    """Plot a histogram of one column and save it as a PDF.

    Args:
        df: Table indexable by column name (``df[variable]``).
        variable: Column to plot; also used for the x-axis label, the title
            and the output file name.
        bins: Passed to ``plt.hist`` as its second positional argument.

    The figure is saved under ``cfg.REPORTS_PLOTS_DIR`` as
    ``Time series histogram <variable>.pdf``.
    """
    # The previous version also computed value_counts() here but never used
    # the result, so that work has been dropped.
    plt.figure()
    plt.hist(df[variable], bins)
    plt.xlabel(variable)
    plt.ylabel("Frequency")
    plt.title(f"{variable} distribution")
    file_name = f"Time series histogram {variable}.pdf"
    plt.savefig(Path.joinpath(cfg.REPORTS_PLOTS_DIR, file_name), dpi=300, bbox_inches="tight")
 
# Stack both outcome groups back into one table, completed rows first.
df = pd.concat([df_comp, df_fail], axis=0)

# One frequency bar chart per categorical column.
category1 = ["Complete", "Gender", "NumberScreening"]
for c in category1:
    bar_plot(df, c)

# One 50-bin histogram per numeric column.
for numeric_column in ("NumberAtsMean", "NumberExercisesMean", "NeedsMean", "PhysicsMean"):
    hist_plot(df, numeric_column, bins=50)
```
 
%% Output
 
 
 
 
 
 
 
 
%% Cell type:code id: tags:
 
``` python
import seaborn as sns
# Annotated heatmap of the pairwise correlations between the selected columns.
list1 = ["Gender", "BirthYear", "NeedsMean", "PhysicsMean", "Complete"]
fig, ax = plt.subplots(figsize=(10, 4))
correlations = df[list1].corr()
sns.heatmap(correlations, annot=True, linewidths=.5, fmt=".2f")
file_name = "Time series heatmap.pdf"
ax.set_title('Feature correlation with Complete')
heatmap_path = Path.joinpath(cfg.REPORTS_PLOTS_DIR, file_name)
plt.savefig(heatmap_path, dpi=300, bbox_inches="tight")
```
 
%% Output
 
 
%% Cell type:code id: tags:
 
``` python
# Bar chart of Complete against Gender.
# seaborn 0.9 renamed factorplot to catplot and its `size` argument to
# `height`; the old spellings are deprecated, so the current names are used.
# kind="bar" is passed explicitly, so the drawn chart type is unchanged.
g = sns.catplot(x="Gender", y="Complete", data=df, kind="bar", height=4)
g.set_ylabels("Complete Probability")
g.fig.suptitle('Complete probability given Gender')
file_name = "Time series factorplot Gender Complete.pdf"
plt.savefig(Path.joinpath(cfg.REPORTS_PLOTS_DIR, file_name), dpi=300, bbox_inches="tight")
```
 
%% Output
 
 
%% Cell type:code id: tags:
 
``` python
# Bar chart of Complete against NumberExercises, restricted to rows with at
# least one exercise (NumberExercises > 0).
# seaborn 0.9 renamed factorplot to catplot and its `size` argument to
# `height`; the old spellings are deprecated, so the current names are used.
rows_with_exercises = df.loc[df.NumberExercises > 0]
g = sns.catplot(x="NumberExercises", y="Complete", data=rows_with_exercises, kind="bar", height=5)
g.set_ylabels("Complete Probability")
g.fig.suptitle('Complete probability given NumberExercises')
file_name = "Time series factorplot NumberExercises Complete.pdf"
plt.savefig(Path.joinpath(cfg.REPORTS_PLOTS_DIR, file_name), dpi=300, bbox_inches="tight")
```
 
%% Output
 
 
%% Cell type:code id: tags:
 
``` python
# Bar chart of Complete against NumberCancels.
# seaborn 0.9 renamed factorplot to catplot and its `size` argument to
# `height`; the old spellings are deprecated, so the current names are used.
g = sns.catplot(x="NumberCancels", y="Complete", data=df, kind="bar", height=5)
g.set_ylabels("Complete Probability")
g.fig.suptitle('Complete probability given NumberCancels')
file_name = "Time series factorplot NumberCancels Complete.pdf"
plt.savefig(Path.joinpath(cfg.REPORTS_PLOTS_DIR, file_name), dpi=300, bbox_inches="tight")
```
 
%% Output
 
 
%% Cell type:code id: tags:
 
``` python
# Summary statistics of len(Exercises) / 9 for each outcome group.
# NOTE(review): the divisor 9 is unexplained here - presumably the width of
# one encoded exercise entry; confirm against the code that builds Exercises.
completed_summary = df_comp.Exercises.map(lambda entry: len(entry) / 9).describe()
print(completed_summary, '\n')
failed_summary = df_fail.Exercises.map(lambda entry: len(entry) / 9).describe()
print(failed_summary)
```
 
%% Output
 
count 1543.000000
mean 3.563693
std 1.198437
min 0.111111
25% 2.666667
50% 3.777778
75% 4.333333
max 4.888889
Name: Exercises, dtype: float64
count 601.000000
mean 3.363838
std 1.377327
min 0.111111
25% 2.666667
50% 3.777778
75% 4.333333
max 4.888889
Name: Exercises, dtype: float64
 
%% Cell type:code id: tags:
 
``` python
def get_ats_list(df):
    """Flatten the comma-separated ``Ats`` column into one list of ATS ids.

    Args:
        df: Object exposing an iterable ``Ats`` attribute (e.g. a DataFrame
            column) whose entries are comma-separated strings.

    Returns:
        A list with every id from every row, in row order. Ids are returned
        exactly as split on "," (no whitespace stripping). Entries that are
        not strings, such as missing values, are skipped rather than raising
        ``AttributeError``.
    """
    return [
        ats
        for ats_string in df.Ats
        if isinstance(ats_string, str)
        for ats in ats_string.split(",")
    ]
 
# Keep only rows that have an Ats value, then flatten each group's ids.
df_comp = df_comp.dropna(subset=['Ats'])
df_fail = df_fail.dropna(subset=['Ats'])
ats_completed = pd.Series(get_ats_list(df_comp))
ats_failed = pd.Series(get_ats_list(df_fail))

# Ten most frequent ATS ids per group (value_counts sorts by count, descending).
print(ats_completed.value_counts().iloc[:10], "\n")
print(ats_failed.value_counts().iloc[:10])
```
 
%% Output
 
120606 2541
093307 1574
222718 1378
043303 1147
091218 1069
122203 1026
043306 736
091203 704
181210 596
123103 561
dtype: int64
120606 973
093307 635
222718 616
043303 498
122203 442
091218 394
043306 314
091203 287
181210 259
123103 242
dtype: int64
 
%% Cell type:code id: tags:
 
``` python
# Fraction of all ATS entries left after dropping the first value_counts()
# row (the most frequent ATS id), printed for each outcome group
print(ats_completed.value_counts()[1:].sum()/len(ats_completed))
print(ats_failed.value_counts()[1:].sum()/len(ats_failed))
```
 
%% Output
 
0.845719489981785
0.8596162169961045
 
%% Cell type:code id: tags:
 
``` python
# The slice 10::-1 walks the descending counts from position 10 back to
# position 0, so each series holds up to 11 ids in ascending-frequency order.
# NOTE(review): the printed ranking elsewhere uses 10 ids - confirm 11 is intended.
top_ats_completed = ats_completed.value_counts().iloc[10::-1]
top_ats_failed = ats_failed.value_counts().iloc[10::-1]

# Same horizontal bar chart for both groups; only data, title and file differ.
ats_charts = (
    (top_ats_completed,
     'ATS frequency for citizens that complete',
     "Time series ATS frequency complete.pdf"),
    (top_ats_failed,
     'ATS frequency for citizens that do not complete',
     "Time series ATS frequency fail.pdf"),
)
for ats_counts, chart_title, file_name in ats_charts:
    plt.figure(figsize=(10, 4))
    ats_counts.plot(kind='barh')
    plt.xlabel('Frequency')
    plt.ylabel('Ats id')
    plt.title(chart_title)
    plt.savefig(Path.joinpath(cfg.REPORTS_PLOTS_DIR, file_name), dpi=300, bbox_inches="tight")
```
 
%% Output
 
 
 
%% Cell type:code id: tags:
 
``` python
# Per-citizen sum of MeanTimeBetweenTrainingMean for each outcome group.
grp_completed = df_comp.groupby(['CitizenId'])['MeanTimeBetweenTrainingMean'].sum()
grp_failed = df_fail.groupby(['CitizenId'])['MeanTimeBetweenTrainingMean'].sum()

plt.figure(figsize=(6, 6))
# Completed citizens in blue, failed in red. The x coordinate is each
# citizen's position within its group, not the actual CitizenId value.
for citizen_totals, point_color in ((grp_completed, 'b'), (grp_failed, 'r')):
    plt.scatter(range(len(citizen_totals)), citizen_totals.values, color=point_color)
plt.title('Citizens and MeanTimeBetweenTrainingMean')
plt.xlabel('Citizen id')
plt.ylabel('MeanTimeBetweenTrainingMean')
plt.yscale('symlog')
```
 
%% Output