Commit 0b5d9c96 authored by Christian Marius Lillelund

Changed feature names and added EDA

parent 13eddbc5
Pipeline #25063 failed in 2 minutes and 28 seconds
%% Cell type:code id: tags:
```
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
class DropColumnsTransformer(BaseEstimator, TransformerMixin):
    """Drop device-related columns and the status date, which should not be used as features."""
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # Drop every column whose name matches the device regexes, then the date column.
        X = X[X.columns.drop(list(X.filter(regex='DevicesUnique')))]
        X = X[X.columns.drop(list(X.filter(regex='DevicesCount')))]
        X = X.drop(['LastStatusDate'], axis=1)
        X = X.reset_index(drop=True)
        return X
# The 'clf' step is a placeholder ('passthrough'); the search below swaps
# concrete estimators into it via the 'clf' key of each param_grid entry.
pipeline = Pipeline([
    ('drop_cols', DropColumnsTransformer()),
    ('scaler', StandardScaler()),
    ('clf', 'passthrough')])
N_NEIGHBORS = [2, 4, 6, 8, 10, 15, 20, 50, 100, 500]
MAX_DEPTH = [2, 4, 6, 8, 10, 12, 14, 16, 25, 50]
N_ESTIMATORS = [100, 150, 250, 300, 350, 400, 500, 600, 700, 1000, 2000, 5000]
MAX_FEATURES = [4, 5, 6, 8, 10, 50, 100, 150, 200]
MIN_SAMPLES_LEAF = [5, 10, 15, 20, 25, 30, 35, 40]
C = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e5, 1e10]
PENALTY = ['none', 'l2']
MAX_ITER = [100, 500, 1000]
KERNEL = ['linear', 'poly', 'rbf']
VAR_SMOOTHING = [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-2, 1e-1, 1e2, 1e5]
param_grid = [
{
'clf': [KNeighborsClassifier()],
'clf__n_neighbors': N_NEIGHBORS
},
{
'clf': [DecisionTreeClassifier(random_state=0)],
'clf__max_depth': MAX_DEPTH
},
{
'clf': [RandomForestClassifier(random_state=0)],
'clf__max_depth': MAX_DEPTH,
'clf__n_estimators': N_ESTIMATORS,
'clf__max_features': MAX_FEATURES,
'clf__min_samples_leaf': MIN_SAMPLES_LEAF
},
    {
        'clf': [LogisticRegression(random_state=0)],
        'clf__C': C,
        'clf__penalty': PENALTY,
        'clf__max_iter': MAX_ITER
    },
{
'clf': [SVC(random_state=0)],
'clf__kernel': KERNEL,
'clf__C': C
},
{
'clf': [GaussianNB()],
'clf__var_smoothing': VAR_SMOOTHING
}
]
grid = RandomizedSearchCV(pipeline, refit=False, cv=5, n_jobs=-1,
n_iter=50, param_distributions=param_grid, scoring="f1")
X = pd.read_csv('../data/processed/X_relative.csv')
y = pd.read_csv('../data/processed/y_relative.csv')
res = grid.fit(X, np.ravel(y))  # ravel y to a 1d array to avoid shape warnings
def report(results, n_top=10):
for i in range(1, n_top + 1):
candidates = np.flatnonzero(results['rank_test_score'] == i)
for candidate in candidates:
print("Model with rank: {0}".format(i))
print("Mean validation score: {0:.3f} (std: {1:.3f})"
.format(results['mean_test_score'][candidate],
results['std_test_score'][candidate]))
print("Parameters: {0}".format(results['params'][candidate]))
print("")
report(res.cv_results_)
```
%% Output
Model with rank: 1
Mean validation score: 0.489 (std: 0.063)
Parameters: {'clf__n_estimators': 5000, 'clf__min_samples_leaf': 15, 'clf__max_features': 8, 'clf__max_depth': 50, 'clf': RandomForestClassifier(random_state=0)}

Model with rank: 2
Mean validation score: 0.487 (std: 0.062)
Parameters: {'clf__n_estimators': 600, 'clf__min_samples_leaf': 30, 'clf__max_features': 6, 'clf__max_depth': 12, 'clf': RandomForestClassifier(random_state=0)}

Model with rank: 3
Mean validation score: 0.485 (std: 0.075)
Parameters: {'clf__n_estimators': 400, 'clf__min_samples_leaf': 35, 'clf__max_features': 6, 'clf__max_depth': 2, 'clf': RandomForestClassifier(random_state=0)}

Model with rank: 4
Mean validation score: 0.483 (std: 0.066)
Parameters: {'clf__n_estimators': 2000, 'clf__min_samples_leaf': 30, 'clf__max_features': 6, 'clf__max_depth': 50, 'clf': RandomForestClassifier(random_state=0)}

Model with rank: 5
Mean validation score: 0.482 (std: 0.076)
Parameters: {'clf__n_estimators': 2000, 'clf__min_samples_leaf': 25, 'clf__max_features': 4, 'clf__max_depth': 16, 'clf': RandomForestClassifier(random_state=0)}

Model with rank: 6
Mean validation score: 0.481 (std: 0.079)
Parameters: {'clf__n_estimators': 1000, 'clf__min_samples_leaf': 15, 'clf__max_features': 6, 'clf__max_depth': 14, 'clf': RandomForestClassifier(random_state=0)}

Model with rank: 7
Mean validation score: 0.477 (std: 0.078)
Parameters: {'clf__n_estimators': 100, 'clf__min_samples_leaf': 15, 'clf__max_features': 4, 'clf__max_depth': 50, 'clf': RandomForestClassifier(random_state=0)}

Model with rank: 8
Mean validation score: 0.475 (std: 0.055)
Parameters: {'clf__n_estimators': 400, 'clf__min_samples_leaf': 15, 'clf__max_features': 4, 'clf__max_depth': 14, 'clf': RandomForestClassifier(random_state=0)}

Model with rank: 9
Mean validation score: 0.474 (std: 0.072)
Parameters: {'clf__n_estimators': 350, 'clf__min_samples_leaf': 35, 'clf__max_features': 5, 'clf__max_depth': 10, 'clf': RandomForestClassifier(random_state=0)}

Model with rank: 10
Mean validation score: 0.472 (std: 0.043)
Parameters: {'clf__n_estimators': 150, 'clf__min_samples_leaf': 5, 'clf__max_features': 4, 'clf__max_depth': 14, 'clf': RandomForestClassifier(random_state=0)}
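Because the search runs with `refit=False`, no `best_estimator_` is fitted at the end. A minimal sketch of refitting the winning configuration by hand, reusing `pipeline`, `res`, `X`, and `y` from the cell above:

```
# Rebuild the top-ranked configuration from cv_results_ and fit it on the full data.
best_idx = np.flatnonzero(res.cv_results_['rank_test_score'] == 1)[0]
best_model = pipeline.set_params(**res.cv_results_['params'][best_idx])
best_model.fit(X, np.ravel(y))
```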
@@ -23,9 +23,8 @@ EXTERNAL_DATA_DIR = Path.joinpath(ROOT_DIR, 'data/external')
 GENERAL_SUBSET = ["Age",
                   'NumberATsRunning',
-                  'Sex', # TODO: Left out RehabIndicator
-                  'NeedsStart',
-                  'PhysicsStart']
+                  'Sex',
+                  'RehabIndicator']
 DEVICE_SUBSET = ['HasRollator',
                  'HasRaisedToiletSeat',
@@ -34,16 +33,24 @@ DEVICE_SUBSET = ['HasRollator',
                  'DevicesCount',
                  'Cluster']
-TRAINING_INFO = ['nWeeks',
+TRAINING_INFO = ['NumberWeeks',
                  'MeanEvaluation',
                  'StdEvaluation',
                  'MinEvaluation',
                  'MaxEvaluation',
-                 'nTrainingPrWeek',
-                 'nTrainingPrWeekMax',
-                 'nTrainingPrWeekMin',
-                 'nWeeksWithTrainings',
-                 'nWeeksWithoutTrainings']
+                 'NumberTraining',
+                 'NumberTrainingPerWeek',
+                 'NumberTrainingPerWeekMax',
+                 'NumberTrainingPerWeekMin',
+                 'NumberWeeksWithTraining',
+                 'TimeBetweenTrainingAvg',
+                 'NumberWeeksNoTraining',
+                 'NumberCancels',
+                 'TimeBetweenCancelsAvg',
+                 'NumberCancelsPerWeekAvg',
+                 'NumberCancelsPerWeekMin',
+                 'NumberCancelsPerWeekMax',
+                 'NumberExercises']
 OBJECT_COLS = ['HasRollator',
                'HasRaisedToiletSeat',
@@ -67,7 +74,7 @@ TWO_SCREENINGS_SUBSET = ['StartYear',
                          'nTrainingPrWeek',
                          'nTrainingPrWeekMax',
                          'nTrainingPrWeekMin',
-                         'TimeBetweenTrainingsAvg',
+                         'TimeBetweenTrainingAvg',
                          'nCancellationsPrWeekAVG',
                          'nCancellationsPrWeekMax',
                          'nCancellationsPrWeekMin',
@@ -40,7 +40,7 @@ def create_window_features(id, data):
     # Create a data window for each citizen's screenings
     id_features = pd.DataFrame()
-    for i, screening in enumerate(citizen_data.sv.itertuples(index=False), start=1):
+    for number, screening in enumerate(citizen_data.sv.itertuples(index=False), start=1):
         start_date = pre_screening.ScreeningDate
         end_date = screening.ScreeningDate
@@ -50,7 +50,7 @@ def create_window_features(id, data):
         window_features['Sex'] = get_sex(citizen_data.sv)
         window_features['Age'] = get_start_year(pre_screening) \
             - get_birth_year(citizen_data.sv)
-        window_features['ScreeningNo'] = i
+        window_features['ScreeningNumber'] = number
         # Get time dependent window data
         tdw, tcw, ssw, huw = get_window_data(citizen_data.td, citizen_data.tc,
@@ -64,39 +64,42 @@ def create_window_features(id, data):
         window_features['LastStatusDate'] = get_last_status_date(ssw, '%Y-%m-%d %H:%M:%S')
         n_weeks = get_interval_length(start_date, end_date)
-        window_features['nWeeks'] = n_weeks
+        window_features['NumberWeeks'] = n_weeks
         window_features['MeanEvaluation'] = get_mean_evaluation(tdw)
         window_features['StdEvaluation'] = get_std_evaluation(tdw)
         window_features['MinEvaluation'] = get_min_evaluation(tdw)
         window_features['MaxEvaluation'] = get_max_evaluation(tdw)
-        window_features['nTraining'] = get_n_training_window(tdw)
-        window_features['nTrainingOptimal'] = get_n_training_optimal(n_weeks)
-        window_features['nTrainingPrWeek'] = get_n_trainings_per_week(n_weeks,
+        window_features['NumberTraining'] = get_n_training_window(tdw)
+        window_features['NumberTrainingOptimal'] = get_n_training_optimal(n_weeks)
+        window_features['NumberTrainingPerWeek'] = get_n_trainings_per_week(n_weeks,
                                                        get_n_training_window(tdw))
         training_pr_week = get_training_per_week(tdw, start_date)
         n_weeks_with_training = get_n_weeks_with_trainings(tdw, start_date)
-        window_features['nTrainingPrWeekMax'] = get_n_training_per_week_max(training_pr_week)
-        window_features['nTrainingPrWeekMin'] = get_n_training_per_week_min(training_pr_week,
+        window_features['NumberTrainingPerWeekMax'] = get_n_training_per_week_max(training_pr_week)
+        window_features['NumberTrainingPerWeekMin'] = get_n_training_per_week_min(training_pr_week,
                                                        n_weeks_with_training, n_weeks)
-        window_features['nWeeksWithTrainings'] = n_weeks_with_training
-        window_features['TimeBetweenTrainingsAvg'] = get_avg_time_between_trainings(tdw)
-        window_features['nWeeksWithTrainingsIn12Weeks'] = get_n_weeks_with_training_first_12(
+        window_features['NumberWeeksWithTraining'] = n_weeks_with_training
+        window_features['TimeBetweenTrainingAvg'] = get_avg_time_between_trainings(tdw)
+        window_features['NumberTrainingFirstTwelveWeeks'] = get_n_weeks_with_training_first_12(
             tdw, start_date)
-        window_features['SP-START'] = get_successful_program_start(tdw, start_date)
-        window_features['SP-END'] = get_successful_program_end(tdw, start_date, end_date)
-        window_features['SP-ALL'] = get_successful_program_all(citizen_data.td)
+        window_features['NumberTrainingLastTwelveWeeks'] = get_n_weeks_with_training_last_12(tdw,
+                                                               start_date, end_date)
+        #window_features['SP-START'] = get_successful_program_start(tdw, start_date)
+        #window_features['SP-END'] = get_successful_program_end(tdw, start_date, end_date)
+        #window_features['SP-ALL'] = get_successful_program_all(citizen_data.td)
         n_cancel = tcw.shape[0]
         cancels_per_week = get_cancels_per_week(tcw)
-        window_features['nWeeksWithoutTrainings'] = get_n_weeks_without_training(n_weeks,
+        window_features['NumberWeeksNoTraining'] = get_n_weeks_without_training(n_weeks,
                                                        n_weeks_with_training)
-        window_features['nCancellations'] = n_cancel
-        window_features['nCancellationsPrWeekAVG'] = get_avg_cancels_per_week(n_cancel, n_weeks)
-        window_features['nCancellationsPrWeekMin'] = get_n_cancel_per_week_min(cancels_per_week)
-        window_features['nCancellationsPrWeekMax'] = get_n_training_per_week_max(cancels_per_week)
+        window_features['NumberCancels'] = n_cancel
+        window_features['TimeBetweenCancelsAvg'] = get_avg_time_between_cancels(tcw)
+        window_features['NumberCancelsPerWeekAvg'] = get_avg_cancels_per_week(n_cancel, n_weeks)
+        window_features['NumberCancelsPerWeekMin'] = get_n_cancel_per_week_min(cancels_per_week)
+        window_features['NumberCancelsPerWeekMax'] = get_n_training_per_week_max(cancels_per_week)
         window_features['NumberATsRunning'] = get_number_of_ats_running(citizen_data.ats, end_date)
         window_features['NewAts'] = pd.Series([get_new_at(huw)])
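For reference, the renamed `TimeBetweenTrainingAvg` and `TimeBetweenCancelsAvg` features both reduce a window of timestamped events to the mean gap between consecutive events. A minimal sketch of that computation, assuming the window is a DataFrame with a `RegistrationDate` column (the helper name and column name here are hypothetical, not the repository's actual implementation):

```
import pandas as pd

def avg_time_between_events(window, date_col='RegistrationDate'):
    # Sort the events, take pairwise differences, and average the gaps in days.
    dates = pd.to_datetime(window[date_col]).sort_values()
    gaps = dates.diff().dropna()
    return gaps.mean().days if not gaps.empty else 0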
@@ -129,9 +132,11 @@ def create_window_features(id, data):
         window_features['Physics'] = get_physics_indicator(
             pre_screening.PhysicalStrengthScore, screening.PhysicalStrengthScore)
+        exercises = pd.Series([get_exercises(pre_screening.ExerciseContent)])
         window_features['RehabIndicator'] = get_rehab_indicator(pre_screening.NeedForHelpScore,
                                                                 pre_screening.PhysicalStrengthScore)
-        window_features['Exercises'] = pd.Series([get_exercises(pre_screening.ExerciseContent)])
+        window_features['Exercises'] = exercises
+        window_features['NumberExercises'] = get_number_of_exercises(get_exercises(pre_screening.ExerciseContent))
         window_features['LastStatus'] = get_last_status(ssw)
         id_features = pd.concat([id_features, window_features], axis=0, ignore_index=True)
@@ -139,6 +144,9 @@ def create_window_features(id, data):
     return id_features
+
+def get_number_of_exercises(exercises):
+    return len(exercises)
 def get_start_year(pre_screening):
     return pd.to_datetime(pre_screening.ScreeningDate).year
@@ -14,7 +14,7 @@ def filter_second_screening(X):
     return X
 def filter_ideal_candidates(X):
-    X = X.loc[(X['nWeeksSum'] >= 8) & (X['nTrainingsSum'] >= 7)]
+    X = X.loc[(X['NumberWeeksSum'] >= 8) & (X['NumberTrainingSum'] >= 7)]
     X = X.drop_duplicates(subset='CitizenId')
     X = X.reset_index(drop=True)
     return X
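To make the renamed filter concrete: it keeps citizens whose cumulative history (the `Sum` columns built in `make_citizen_features`, see a later hunk) reaches at least 8 weeks and 7 training sessions, and keeps only the first qualifying window per citizen. A quick illustration with invented toy data:

```
import pandas as pd

X = pd.DataFrame({'CitizenId': [1, 1, 2],
                  'NumberWeeksSum': [6, 12, 9],
                  'NumberTrainingSum': [5, 10, 3]})
X = X.loc[(X['NumberWeeksSum'] >= 8) & (X['NumberTrainingSum'] >= 7)]
X = X.drop_duplicates(subset='CitizenId')
# Only citizen 1's second window survives; citizen 2 never reaches 7 trainings.
print(X.reset_index(drop=True))
```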
@@ -38,6 +38,6 @@ def filter_sp2(X):
     return X
 def filter_total_weeks(X, min_total, max_total):
-    X['nWeeksTotal'] = X.groupby(['CitizenId'])['nWeeks'].transform(pd.Series.sum)
-    X = X.loc[(X['nWeeksTotal'] >= min_total) & (X['nWeeksTotal'] <= max_total)]
+    X['NumberWeeksTotal'] = X.groupby(['CitizenId'])['NumberWeeks'].transform(pd.Series.sum)
+    X = X.loc[(X['NumberWeeksTotal'] >= min_total) & (X['NumberWeeksTotal'] <= max_total)]
     return X
@@ -150,8 +150,8 @@ def cols_exist(df, cols):
     return set(cols).issubset(df.columns)
 def make_citizen_features(X):
-    X['nWeeksSum'] = X.groupby('CitizenId')['nWeeks'].transform(pd.Series.cumsum)
-    X['nTrainingsSum'] = X.groupby('CitizenId')['nTraining'].transform(pd.Series.cumsum)
+    X['NumberWeeksSum'] = X.groupby('CitizenId')['NumberWeeks'].transform(pd.Series.cumsum)
+    X['NumberTrainingSum'] = X.groupby('CitizenId')['NumberTraining'].transform(pd.Series.cumsum)
     X['NeedsStartBaseline'] = X.groupby('CitizenId')["NeedsStart"].transform('first')
     return X
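The `Sum` columns built here are running totals per citizen, which is what lets `filter_ideal_candidates` act on a citizen's history up to each window: `transform` returns one value per row, unlike `agg`. A small illustration with invented values:

```
import pandas as pd

X = pd.DataFrame({'CitizenId': [1, 1, 1, 2],
                  'NumberWeeks': [4, 5, 3, 9],
                  'NeedsStart': [50, 45, 40, 70]})
X['NumberWeeksSum'] = X.groupby('CitizenId')['NumberWeeks'].transform(pd.Series.cumsum)
X['NeedsStartBaseline'] = X.groupby('CitizenId')['NeedsStart'].transform('first')
print(X)
# NumberWeeksSum:     4, 9, 12, 9  -- cumulative weeks per citizen
# NeedsStartBaseline: 50, 50, 50, 70 -- first screening's value repeated per row
```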
@@ -169,7 +169,7 @@ def get_diff(row):
         diff_pct = (row['NeedsStartBaseline'] - row['NeedsEnd']) / row['NeedsStartBaseline']
         return 1 if diff_pct >= 0.1 else 0
     else:
-        raise ValueError('NeedsStartBaseline was zero, cannot compute need')
+        raise ValueError('NeedsStartBaseline was zero, cannot compute needs')
 def scale_features(features):
     scaler = MinMaxScaler(feature_range=(0, 1))
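A worked example of the label rule in `get_diff`, using hypothetical scores:

```
# Hypothetical values illustrating the 10% improvement threshold in get_diff.
baseline, end = 50, 44
diff_pct = (baseline - end) / baseline   # (50 - 44) / 50 = 0.12
label = 1 if diff_pct >= 0.1 else 0      # 1: needs score dropped by at least 10%
# With end = 47 instead, diff_pct = 0.06 and the label would be 0.
```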
@@ -44,8 +44,8 @@ def run():
     X = pp.encode_vector_dummy(X, cfg.LIST_COLS)
     X = pp.replace_numerical_inf(X)
-    save_model(X, cfg.PROCESSED_DATA_DIR, f'X_cv_ready_relative.csv')
-    save_model(y, cfg.PROCESSED_DATA_DIR, f'y_cv_ready_relative.csv')
+    save_model(X, cfg.PROCESSED_DATA_DIR, f'X_relative.csv')
+    save_model(y, cfg.PROCESSED_DATA_DIR, f'y_relative.csv')
     train_rf(X, y)
 def train_rf(X, y):
@@ -74,8 +74,8 @@ def run_features_absolute():
     X = pp.encode_vector_dummy(X, cfg.LIST_COLS)
     X = pp.replace_numerical_inf(X)
-    save_model(X, cfg.PROCESSED_DATA_DIR, f'X_cv_ready_{threshold}.csv')
-    save_model(y, cfg.PROCESSED_DATA_DIR, f'y_cv_ready_{threshold}.csv')
+    save_model(X, cfg.PROCESSED_DATA_DIR, f'X_{threshold}.csv')
+    save_model(y, cfg.PROCESSED_DATA_DIR, f'y_{threshold}.csv')
 if __name__ == "__main__":
     run()
\ No newline at end of file