Commit 7a47d581 authored by Christian Marius Lillelund
Browse files

Included more data about ats

parent 05e66547
Pipeline #25608 failed with stage
in 2 minutes and 48 seconds
This diff is collapsed.
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
import src.models.feature_maker as fm
import src.models.preprocessor as pp
import src.config as cfg
# Reset every pandas option matching '^display.' and then allow up to 100
# columns to be shown when a frame is displayed.
pd.reset_option('^display.', silent=True)
pd.set_option('display.max_columns', 100)

# Load the interim time series and discard the four *Reason columns.
df = pd.read_csv('../data/interim/timeseries.csv')
df = df.drop(columns=['NeedsStartReason', 'NeedsReason',
                      'PhysicsStartReason', 'PhysicsReason'])

# Per-citizen running totals of weeks and trainings, plus the citizen's
# 'first' NeedsStart value broadcast to all of that citizen's rows.
per_citizen = df.groupby('CitizenId')
df['NumberWeeksSum'] = per_citizen['NumberWeeks'].transform(pd.Series.cumsum)
df['NumberTrainingSum'] = per_citizen['NumberTraining'].transform(pd.Series.cumsum)
df['NeedsStartBaseline'] = per_citizen['NeedsStart'].transform('first')

# Rows with a zero baseline are removed; the baseline is used as a divisor
# when the relative improvement is computed further down.
df = df.loc[df['NeedsStartBaseline'].ne(0)]
# Completed: the first row per citizen at which the running totals reach
# at least 8 weeks and at least 7 trainings.
reached_threshold = (df['NumberWeeksSum'] >= 8) & (df['NumberTrainingSum'] >= 7)
df_completed = (
    df.loc[reached_threshold]
    .drop_duplicates(subset='CitizenId')
    .reset_index(drop=True)
)

# Not completed: every citizen absent from df_completed, represented by
# their last row.
is_completer = df['CitizenId'].isin(df_completed['CitizenId'])
df_not_completed = (
    df.loc[~is_completer]
    .drop_duplicates(subset='CitizenId', keep='last')
    .reset_index(drop=True)
)

# Improved: completers whose NeedsEnd is at least 10 % below their baseline,
# measured relative to the baseline.
relative_gain = (
    (df_completed['NeedsStartBaseline'] - df_completed['NeedsEnd'])
    / df_completed['NeedsStartBaseline']
)
df_improved = df_completed.loc[relative_gain >= 0.1]

# Not improved: the remaining completers.
has_improved = df_completed['CitizenId'].isin(df_improved['CitizenId'])
df_not_improved = df_completed.loc[~has_improved]
# Columns left out of the profiling reports.
unrelated_cols = ['Exercises', 'LastStatus', 'LastStatusDate', 'DevicesCount', 'DevicesUnique']

df_completed_prep = df_completed.drop(unrelated_cols, axis=1)
df_not_completed_prep = df_not_completed.drop(unrelated_cols, axis=1)
df_improved_prep = df_improved.drop(unrelated_cols, axis=1)
df_not_improved_prep = df_not_improved.drop(unrelated_cols, axis=1)

# One pandas-profiling report per cohort.
profile_comp = ProfileReport(df_completed_prep)
profile_fail = ProfileReport(df_not_completed_prep)
profile_improved = ProfileReport(df_improved_prep)
profile_not_improved = ProfileReport(df_not_improved_prep)

# Forward slashes are used for the output paths: the previous literals
# ("..\\reports\profile_*.html") contained the invalid escape sequence "\p"
# and only resolved to the reports directory on Windows. Forward slashes
# work on Windows as well and match the read_csv path used in this cell.
profile_comp.to_file("../reports/profile_completed.html")
profile_fail.to_file("../reports/profile_fail.html")
profile_improved.to_file("../reports/profile_improved.html")
profile_not_improved.to_file("../reports/profile_not_improved.html")
# Preprocessing
# Add the citizen-level features, then rebuild the completed / failed
# cohorts with the same thresholds as above (>= 8 weeks, >= 7 trainings).
df = fm.make_citizen_features(df)
df_completed = df.loc[(df['NumberWeeksSum'] >= 8)
                      & (df['NumberTrainingSum'] >= 7)].drop_duplicates(subset='CitizenId')
df_failed = df.drop(df[df.CitizenId.isin(df_completed.CitizenId)].index) \
    .drop_duplicates(subset='CitizenId', keep='last')

# Both cohorts are restricted to the same configured feature columns.
feature_cols = (['CitizenId'] + cfg.GENERAL_FEATURES
                + cfg.CUMULATIVE_FEATURES + cfg.COMPLETES_FEATURES)
df_completed = df_completed[feature_cols]
df_failed = df_failed[feature_cols]

profile_completed = ProfileReport(df_completed)
profile_failed = ProfileReport(df_failed)

# Forward slashes replace the previous "..\\reports\profile_*.html" literals,
# which contained the invalid escape sequence "\p" and were Windows-only.
# NOTE(review): profile_completed.html is also written earlier in this cell
# from the un-preprocessed cohort, so this call overwrites that report —
# confirm that is intended.
profile_completed.to_file("../reports/profile_completed.html")
profile_failed.to_file("../reports/profile_failed.html")
```
%% Output
Summarize dataset: 100%|██████████| 59/59 [01:56<00:00, 1.97s/it, Completed]\nGenerate report structure: 100%|██████████| 1/1 [00:09<00:00, 9.17s/it]\nRender HTML: 100%|██████████| 1/1 [00:15<00:00, 15.99s/it]\nExport report to file: 100%|██████████| 1/1 [00:00<00:00, 3.72it/s]\nSummarize dataset: 100%|██████████| 59/59 [01:53<00:00, 1.92s/it, Completed]\nGenerate report structure: 100%|██████████| 1/1 [00:10<00:00, 10.56s/it]\nRender HTML: 100%|██████████| 1/1 [00:13<00:00, 13.27s/it]\nExport report to file: 100%|██████████| 1/1 [00:00<00:00, 4.43it/s]\nSummarize dataset: 100%|██████████| 60/60 [01:52<00:00, 1.88s/it, Completed]\nGenerate report structure: 100%|██████████| 1/1 [00:11<00:00, 11.62s/it]\nRender HTML: 100%|██████████| 1/1 [00:12<00:00, 12.80s/it]\nExport report to file: 100%|██████████| 1/1 [00:00<00:00, 5.16it/s]\nSummarize dataset: 100%|██████████| 60/60 [02:05<00:00, 2.09s/it, Completed]\nGenerate report structure: 100%|██████████| 1/1 [00:09<00:00, 9.75s/it]\nRender HTML: 100%|██████████| 1/1 [00:13<00:00, 13.34s/it]\nExport report to file: 100%|██████████| 1/1 [00:00<00:00, 4.76it/s]\n
Traceback (most recent call last):
File "C:\Users\cml\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3343, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-1-e209fd223d87>", line 6, in <module>
import src.models.feature_maker as fm
File "c:\users\cml\desktop\air\src\models\feature_maker.py", line 49
/ row['NeedsStartBaseline']
^
IndentationError: unexpected indent
%% Cell type:code id: tags:
``` python
```
......
This diff is collapsed.
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
class DropColumnsTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that removes device and status-date columns.

    ``transform`` drops every column whose label matches the regex
    'DevicesUnique' or 'DevicesCount', drops the 'LastStatusDate' column,
    and returns the frame with a fresh 0..n-1 index.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` without the device / status-date columns.

        ``X`` must contain a 'LastStatusDate' column; ``DataFrame.drop``
        raises otherwise.
        """
        for pattern in ('DevicesUnique', 'DevicesCount'):
            matching = list(X.filter(regex=pattern))
            X = X[X.columns.drop(matching)]
        X = X.drop(['LastStatusDate'], axis=1)
        return X.reset_index(drop=True)
# Column clean-up, standardisation, then a classifier slot. The 'clf' step
# starts as 'passthrough'; every entry of param_grid supplies a concrete
# estimator for it.
pipeline = Pipeline(steps=[
    ('drop_cols', DropColumnsTransformer()),
    ('scaler', StandardScaler()),
    ('clf', 'passthrough'),
])
# Candidate hyper-parameter values referenced by param_grid below.
# KNeighborsClassifier: n_neighbors
N_NEIGHBORS = [2, 4, 6, 8, 10, 15, 20, 50, 100, 500]
# DecisionTreeClassifier and RandomForestClassifier: max_depth
MAX_DEPTH = [2, 4, 6, 8, 10, 12, 14, 16, 25, 50]
# RandomForestClassifier: n_estimators, max_features, min_samples_leaf
N_ESTIMATORS = [100, 150, 250, 300, 350, 400, 500, 600, 700, 1000, 2000, 5000]
MAX_FEATURES = [4, 5, 6, 8, 10, 50, 100, 150, 200]
MIN_SAMPLES_LEAF = [5, 10, 15, 20, 25, 30, 35, 40]
# LogisticRegression and SVC: C
C = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e5, 1e10]
# LogisticRegression: penalty
PENALTY = ['none', 'l2']
# NOTE(review): MAX_ITER is not referenced by param_grid in this cell —
# presumably meant for LogisticRegression's max_iter; confirm or remove.
MAX_ITER = [100, 500, 1000]
# SVC: kernel
KERNEL = ['linear', 'poly', 'rbf']
# GaussianNB: var_smoothing
VAR_SMOOTHING = [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-2, 1e-1, 1e2, 1e5]
# One dictionary per model family. 'clf' selects the estimator placed in the
# pipeline's final step; the 'clf__*' keys list the values tried for that
# estimator's hyper-parameters.
param_grid = [
    # k-nearest neighbours
    {
        'clf': [KNeighborsClassifier()],
        'clf__n_neighbors': N_NEIGHBORS,
    },
    # single decision tree
    {
        'clf': [DecisionTreeClassifier(random_state=0)],
        'clf__max_depth': MAX_DEPTH,
    },
    # random forest
    {
        'clf': [RandomForestClassifier(random_state=0)],
        'clf__max_depth': MAX_DEPTH,
        'clf__n_estimators': N_ESTIMATORS,
        'clf__max_features': MAX_FEATURES,
        'clf__min_samples_leaf': MIN_SAMPLES_LEAF,
    },
    # logistic regression
    {
        'clf': [LogisticRegression(random_state=0)],
        'clf__C': C,
        'clf__penalty': PENALTY,
    },
    # support vector classifier
    {
        'clf': [SVC(random_state=0)],
        'clf__kernel': KERNEL,
        'clf__C': C,
    },
    # Gaussian naive Bayes
    {
        'clf': [GaussianNB()],
        'clf__var_smoothing': VAR_SMOOTHING,
    },
]
# Randomized search: 50 candidates drawn from param_grid, each scored with
# F1 under 5-fold cross-validation on all available cores. refit=False means
# no final model is trained on the full data; only cv_results_ is used.
# random_state=0 fixes which candidates are sampled so that repeated runs of
# this cell evaluate the same configurations (the estimators in param_grid
# are already seeded with random_state=0).
grid = RandomizedSearchCV(
    pipeline,
    param_distributions=param_grid,
    n_iter=50,
    scoring="f1",
    cv=5,
    refit=False,
    n_jobs=-1,
    random_state=0,
)
# Name of the processed dataset to search on: 'completes' or 'relative'.
# Previously both datasets were read and the 'relative' frames were
# immediately overwritten, so only 'completes' was ever fitted; the selector
# keeps that result while avoiding the discarded reads.
DATASET = 'completes'
X = pd.read_csv(f'../data/processed/X_{DATASET}.csv')
# NOTE(review): y is passed to fit() as a DataFrame; if the file holds a
# single label column, a 1-d array may be preferable — confirm the CSV layout.
y = pd.read_csv(f'../data/processed/y_{DATASET}.csv')
res = grid.fit(X, y)
def report(results, n_top=10):
    """Print the best-ranked candidates of a hyper-parameter search.

    Args:
        results: Mapping with the keys 'rank_test_score', 'mean_test_score',
            'std_test_score' and 'params', each an equally long sequence
            (for example a fitted search object's ``cv_results_``).
        n_top: Highest rank to print; ranks 1..n_top are reported.

    For every candidate holding one of those ranks, prints its rank, mean and
    standard deviation of the validation score, and its parameters, followed
    by a blank line. Candidates sharing a rank are all printed.
    """
    # Coerce once so that plain lists work too; comparing a list with an int
    # would yield a single False and silently report nothing.
    ranks = np.asarray(results['rank_test_score'])
    for rank in range(1, n_top + 1):
        for candidate in np.flatnonzero(ranks == rank):
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
# Print the candidates ranked 1-10 (report's default n_top) from the search.
report(res.cv_results_)
```
%% Output
Model with rank: 1
Mean validation score: 0.489 (std: 0.063)
Parameters: {'clf__n_estimators': 5000, 'clf__min_samples_leaf': 15, 'clf__max_features': 8, 'clf__max_depth': 50, 'clf': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=0, verbose=0,
warm_start=False)}
Model with rank: 2
Mean validation score: 0.487 (std: 0.062)
Parameters: {'clf__n_estimators': 600, 'clf__min_samples_leaf': 30, 'clf__max_features': 6, 'clf__max_depth': 12, 'clf': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=0, verbose=0,
warm_start=False)}
Model with rank: 3
Mean validation score: 0.485 (std: 0.075)
Parameters: {'clf__n_estimators': 400, 'clf__min_samples_leaf': 35, 'clf__max_features': 6, 'clf__max_depth': 2, 'clf': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=0, verbose=0,
warm_start=False)}
Model with rank: 4
Mean validation score: 0.483 (std: 0.066)
Parameters: {'clf__n_estimators': 2000, 'clf__min_samples_leaf': 30, 'clf__max_features': 6, 'clf__max_depth': 50, 'clf': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=0, verbose=0,
warm_start=False)}
Model with rank: 5
Mean validation score: 0.482 (std: 0.076)
Parameters: {'clf__n_estimators': 2000, 'clf__min_samples_leaf': 25, 'clf__max_features': 4, 'clf__max_depth': 16, 'clf': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=0, verbose=0,
warm_start=False)}
Model with rank: 6
Mean validation score: 0.481 (std: 0.079)
Parameters: {'clf__n_estimators': 1000, 'clf__min_samples_leaf': 15, 'clf__max_features': 6, 'clf__max_depth': 14, 'clf': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=0, verbose=0,
warm_start=False)}
Model with rank: 7
Mean validation score: 0.477 (std: 0.078)
Parameters: {'clf__n_estimators': 100, 'clf__min_samples_leaf': 15, 'clf__max_features': 4, 'clf__max_depth': 50, 'clf': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=0, verbose=0,
warm_start=False)}
Model with rank: 8
Mean validation score: 0.475 (std: 0.055)
Parameters: {'clf__n_estimators': 400, 'clf__min_samples_leaf': 15, 'clf__max_features': 4, 'clf__max_depth': 14, 'clf': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=0, verbose=0,
warm_start=False)}
Model with rank: 9
Mean validation score: 0.474 (std: 0.072)
Parameters: {'clf__n_estimators': 350, 'clf__min_samples_leaf': 35, 'clf__max_features': 5, 'clf__max_depth': 10, 'clf': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=0, verbose=0,
warm_start=False)}
Model with rank: 10
Mean validation score: 0.472 (std: 0.043)
Parameters: {'clf__n_estimators': 150, 'clf__min_samples_leaf': 5, 'clf__max_features': 4, 'clf__max_depth': 14, 'clf': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=0, verbose=0,
warm_start=False)}
Model with rank: 1\nMean validation score: 0.851 (std: 0.041)\nParameters: {'clf__n_estimators': 500, 'clf__min_samples_leaf': 25, 'clf__max_features': 8, 'clf__max_depth': 50, 'clf': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n criterion='gini', max_depth=None, max_features='auto',\n max_leaf_nodes=None, max_samples=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators=100,\n n_jobs=None, oob_score=False, random_state=0, verbose=0,\n warm_start=False)}\n\nModel with rank: 2\nMean validation score: 0.851 (std: 0.041)\nParameters: {'clf__n_estimators': 5000, 'clf__min_samples_leaf': 10, 'clf__max_features': 4, 'clf__max_depth': 25, 'clf': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n criterion='gini', max_depth=None, max_features='auto',\n max_leaf_nodes=None, max_samples=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators=100,\n n_jobs=None, oob_score=False, random_state=0, verbose=0,\n warm_start=False)}\n\nModel with rank: 3\nMean validation score: 0.850 (std: 0.047)\nParameters: {'clf__n_estimators': 500, 'clf__min_samples_leaf': 5, 'clf__max_features': 4, 'clf__max_depth': 8, 'clf': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n criterion='gini', max_depth=None, max_features='auto',\n max_leaf_nodes=None, max_samples=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators=100,\n n_jobs=None, oob_score=False, random_state=0, verbose=0,\n warm_start=False)}\n\nModel with rank: 4\nMean validation score: 0.850 (std: 0.043)\nParameters: {'clf__n_estimators': 300, 'clf__min_samples_leaf': 5, 'clf__max_features': 6, 'clf__max_depth': 4, 'clf': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, 
class_weight=None,\n criterion='gini', max_depth=None, max_features='auto',\n max_leaf_nodes=None, max_samples=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators=100,\n n_jobs=None, oob_score=False, random_state=0, verbose=0,\n warm_start=False)}\n\nModel with rank: 5\nMean validation score: 0.849 (std: 0.036)\nParameters: {'clf__n_estimators': 2000, 'clf__min_samples_leaf': 40, 'clf__max_features': 8, 'clf__max_depth': 4, 'clf': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n criterion='gini', max_depth=None, max_features='auto',\n max_leaf_nodes=None, max_samples=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators=100,\n n_jobs=None, oob_score=False, random_state=0, verbose=0,\n warm_start=False)}\n\nModel with rank: 6\nMean validation score: 0.848 (std: 0.048)\nParameters: {'clf__n_estimators': 300, 'clf__min_samples_leaf': 15, 'clf__max_features': 5, 'clf__max_depth': 4, 'clf': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n criterion='gini', max_depth=None, max_features='auto',\n max_leaf_nodes=None, max_samples=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators=100,\n n_jobs=None, oob_score=False, random_state=0, verbose=0,\n warm_start=False)}\n\nModel with rank: 7\nMean validation score: 0.848 (std: 0.050)\nParameters: {'clf__n_estimators': 250, 'clf__min_samples_leaf': 10, 'clf__max_features': 5, 'clf__max_depth': 6, 'clf': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n criterion='gini', max_depth=None, max_features='auto',\n max_leaf_nodes=None, max_samples=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, 
n_estimators=100,\n n_jobs=None, oob_score=False, random_state=0, verbose=0,\n warm_start=False)}\n\nModel with rank: 8\nMean validation score: 0.847 (std: 0.053)\nParameters: {'clf__n_estimators': 100, 'clf__min_samples_leaf': 15, 'clf__max_features': 6, 'clf__max_depth': 25, 'clf': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n criterion='gini', max_depth=None, max_features='auto',\n max_leaf_nodes=None, max_samples=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators=100,\n n_jobs=None, oob_score=False, random_state=0, verbose=0,\n warm_start=False)}\n\nModel with rank: 9\nMean validation score: 0.846 (std: 0.031)\nParameters: {'clf__n_estimators': 150, 'clf__min_samples_leaf': 30, 'clf__max_features': 4, 'clf__max_depth': 4, 'clf': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n criterion='gini', max_depth=None, max_features='auto',\n max_leaf_nodes=None, max_samples=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators=100,\n n_jobs=None, oob_score=False, random_state=0, verbose=0,\n warm_start=False)}\n\nModel with rank: 10\nMean validation score: 0.845 (std: 0.037)\nParameters: {'clf__n_estimators': 300, 'clf__min_samples_leaf': 40, 'clf__max_features': 6, 'clf__max_depth': 14, 'clf': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n criterion='gini', max_depth=None, max_features='auto',\n max_leaf_nodes=None, max_samples=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators=100,\n n_jobs=None, oob_score=False, random_state=0, verbose=0,\n warm_start=False)}\n\n
......