Commit 6128cde2 authored by Christian Marius Lillelund's avatar Christian Marius Lillelund

added clusters back to features, improved profiling

parent 805143ae
Pipeline #44014 failed in 3 minutes and 10 seconds
%% Cell type:code id: tags:
```
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Read each of the ten Ats columns as strings so ISO codes keep their leading zeros
ats = {str(i) + 'Ats': str for i in range(1, 11)}
df = pd.read_csv('../data/processed/fall.csv', converters=ats)
```
%% Cell type:code id: tags:
```
# Collect every unique Ats code that appears across the ten Ats columns
columns = [df[f'{i}Ats'].unique() for i in range(1, 11)]
columns = list(set(np.concatenate(columns)))
```
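%% Cell type:markdown id: tags:
The `OneHotEncoder` imported above is otherwise unused in this notebook; a minimal sketch of how the collected category list could drive it (an illustration, not code from this commit):
%% Cell type:code id: tags:
```
# Encode the ten Ats columns with a shared category list; unseen codes are ignored
encoder = OneHotEncoder(categories=[columns] * 10, handle_unknown='ignore')
encoded = encoder.fit_transform(df[[f'{i}Ats' for i in range(1, 11)]])
```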
%% Cell type:code id: tags:
```
# NOTE: this raises a ValueError (traceback below): df.columns[4:] is an Index,
# which pivot_table wraps as a single array-like grouper of the wrong length
pd.pivot_table(df, index=['Gender', 'BirthYear', 'NumberAts', 'Fall'], columns=df.columns[4:], aggfunc=np.sum)
```
%% Output
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-37-e853626613f8> in <module>
----> 1 pd.pivot_table(df, index=['Gender', 'BirthYear', 'NumberAts', 'Fall'], columns=df.columns[4:], aggfunc=np.sum)
~\miniconda3\envs\py38-air\lib\site-packages\pandas\core\reshape\pivot.py in pivot_table(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name, observed)
109 values = list(values)
110
--> 111 grouped = data.groupby(keys, observed=observed)
112 agged = grouped.agg(aggfunc)
113 if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
~\miniconda3\envs\py38-air\lib\site-packages\pandas\core\frame.py in groupby(self, by, axis, level, as_index, sort, group_keys, squeeze, observed, dropna)
6509 axis = self._get_axis_number(axis)
6510
-> 6511 return DataFrameGroupBy(
6512 obj=self,
6513 keys=by,
~\miniconda3\envs\py38-air\lib\site-packages\pandas\core\groupby\groupby.py in __init__(self, obj, keys, axis, level, grouper, exclusions, selection, as_index, sort, group_keys, squeeze, observed, mutated, dropna)
523 from pandas.core.groupby.grouper import get_grouper
524
--> 525 grouper, exclusions, obj = get_grouper(
526 obj,
527 keys,
~\miniconda3\envs\py38-air\lib\site-packages\pandas\core\groupby\grouper.py in get_grouper(obj, key, axis, level, sort, observed, mutated, validate, dropna)
796 # allow us to passing the actual Grouping as the gpr
797 ping = (
--> 798 Grouping(
799 group_axis,
800 gpr,
~\miniconda3\envs\py38-air\lib\site-packages\pandas\core\groupby\grouper.py in __init__(self, index, grouper, obj, name, level, sort, observed, in_axis, dropna)
425 self.name = name
426 self.level = level
--> 427 self.grouper = _convert_grouper(index, grouper)
428 self.all_grouper = None
429 self.index = index
~\miniconda3\envs\py38-air\lib\site-packages\pandas\core\groupby\grouper.py in _convert_grouper(axis, grouper)
837 elif isinstance(grouper, (list, Series, Index, np.ndarray)):
838 if len(grouper) != len(axis):
--> 839 raise ValueError("Grouper and axis must be same length")
840 return grouper
841 else:
ValueError: Grouper and axis must be same length
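%% Cell type:markdown id: tags:
The ValueError arises because `df.columns[4:]` is a pandas `Index`, which `pivot_table` treats like a scalar and wraps as a single array-like grouper; its length (the number of trailing columns) cannot match the row count. A hedged sketch of a working call, assuming the intent was to pivot on one Ats column and sum falls:
%% Cell type:code id: tags:
```
# Pass column names (e.g. one Ats column), not an Index, as the pivot columns
pd.pivot_table(df, index=['Gender', 'BirthYear', 'NumberAts'],
               columns='1Ats', values='Fall', aggfunc=np.sum)
```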
feature,shap_values,feat_imp
3Ats,0.35438943,0.05037573
2Ats,0.28799805,0.041522097
7Ats,0.25838324,0.045328997
4Ex,0.25140244,0.041111276
8Ats,0.22756055,0.047804035
5Ex,0.2272673,0.03368269
3Ex,0.21831329,0.03804887
4Ats,0.21751775,0.041362703
6Ats,0.21655297,0.035452005
6Ex,0.21542017,0.036072284
Needs,0.21310568,0.032251004
10Ats,0.20146725,0.047944907
1Ats,0.19626786,0.04442799
2Ex,0.19375794,0.039941315
5Ats,0.18881403,0.04191209
1Ex,0.16665366,0.035134666
7Ex,0.14979053,0.041033033
8Ex,0.12900291,0.044927623
9Ats,0.12805317,0.04935159
Physics,0.10826816,0.027378794
BirthYear,0.10358753,0.02620498
MeanEvaluation,0.098001756,0.025608635
NumberFalls,0.093598895,0.033897128
NumberAts,0.06784952,0.025121937
Gender,0.032643776,0.021802304
NumberExercises,0.023147892,0.021525797
9Ex,0.015966184,0.023029841
NumberCancels,0.0019818926,0.00774568
MaxEvaluation,0.0,0.0
3Ats,0.31289357,0.047934923
2Ats,0.29975817,0.043996885
7Ats,0.29632568,0.04681093
8Ats,0.2858649,0.04623301
4Ats,0.25710154,0.04035271
4Ex,0.25318545,0.038030393
1Ats,0.24223052,0.04620158
5Ats,0.23203886,0.044167936
10Ats,0.22442655,0.047305025
3Ex,0.21373507,0.038372926
6Ex,0.17370935,0.0344399
5Ex,0.17300294,0.033084366
2Ex,0.17278673,0.034701586
Needs,0.16645956,0.02923049
9Ats,0.14796755,0.04421669
Cluster,0.13645108,0.027709806
8Ex,0.12976792,0.047058653
BirthYear,0.12052875,0.026461434
6Ats,0.11824282,0.032616466
7Ex,0.11497209,0.03494578
1Ex,0.11381523,0.02706626
MeanEvaluation,0.104623675,0.025604129
Physics,0.097208135,0.026487732
NumberAts,0.08153105,0.02365695
NumberFalls,0.07433112,0.029914727
Gender,0.03533531,0.028777694
NumberExercises,0.028090222,0.021228254
9Ex,0.025323618,0.032417543
NumberCancels,0.00019256209,0.0009752363
MaxNumberCancelsWeek,0.0,0.0
MaxNumberTrainingWeek,0.0,0.0
MaxEvaluation,0.0,0.0
MinEvaluation,0.0,0.0
MinNumberCancelsWeek,0.0,0.0
MinNumberTrainingWeekMin,0.0,0.0
......
feature,shap_values,feat_imp
BirthYear,0.41616747,0.030733922
1Ats,0.4063895,0.06565259
8Ats,0.3453722,0.2556873
2Ats,0.34433904,0.11163904
6Ats,0.28929475,0.15771021
NumberAts,0.22185978,0.036133148
10Ats,0.20745935,0.08296268
4Ats,0.20453438,0.043867916
9Ats,0.17320465,0.1028832
3Ats,0.14739995,0.034372207
5Ats,0.07399562,0.029653512
7Ats,0.061252814,0.03167247
Gender,0.017475614,0.017031824
BirthYear,0.4606245,0.031895686
1Ats,0.45709953,0.06361309
2Ats,0.3448231,0.10200407
8Ats,0.34153026,0.24633452
6Ats,0.29085833,0.15906748
NumberAts,0.24784136,0.035005152
10Ats,0.21734425,0.08265754
4Ats,0.21225475,0.043335676
Cluster,0.20631586,0.02699889
9Ats,0.18807764,0.09792797
3Ats,0.14401208,0.033725616
5Ats,0.086939245,0.02986654
7Ats,0.066484466,0.031133974
Gender,0.02026758,0.016433802
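The two tables above pair a mean-|SHAP| column with the model's built-in feature importances, before and after the Cluster feature was added back. A hypothetical sketch of how such a table is commonly produced (the estimator, the `shap` usage, and the prepared `X`, `y` are assumptions, not code from this commit):
```
import numpy as np
import pandas as pd
import shap
from sklearn.ensemble import RandomForestClassifier

# X, y: the prepared feature matrix and labels (assumed available)
model = RandomForestClassifier(random_state=0).fit(X, y)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)[1]  # SHAP values for the positive class

table = pd.DataFrame({
    'feature': X.columns,
    'shap_values': np.abs(shap_values).mean(axis=0),  # mean |SHAP| per feature
    'feat_imp': model.feature_importances_,           # impurity-based importance
}).sort_values('shap_values', ascending=False)
```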
@@ -17,7 +17,7 @@ from sklearn.utils import shuffle
 from typing import List, Dict
 from sklearn.preprocessing import MinMaxScaler, StandardScaler

-CASE = "Complete"
+CASE = "Fall"
 COMPLETE_FILENAME = "complete_with_embeddings.csv"
 FALL_FILENAME = "fall_with_embeddings.csv"
 OUTPUT_FILENAME = f"Best {CASE} features.csv"
......
from pandas_profiling import ProfileReport
from tools import file_reader, file_writer
import config as cfg

def main():
    # Load data; read the Ex/Ats columns as strings to preserve their codes
    ex = {str(i)+'Ex': str for i in range(1, 10)}
    ats = {str(i)+'Ats': str for i in range(1, 11)}
    complete = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
                                    'complete.csv', converters={**ex, **ats})
    fall = file_reader.read_csv(cfg.PROCESSED_DATA_DIR, 'fall.csv',
                                converters=ats)

    # Generate and save profiles
    profile_complete = ProfileReport(complete)
    profile_fall = ProfileReport(fall)
    file_writer.write_report(profile_complete, cfg.REPORTS_DIR, 'profile_complete.html')
    file_writer.write_report(profile_fall, cfg.REPORTS_DIR, 'profile_fall.html')

if __name__ == "__main__":
    main()
\ No newline at end of file
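If generating these reports is slow on the wide one-hot encoded frames, pandas-profiling's minimal mode (available in its 2.x releases) skips the costly correlation and interaction sections; a sketch, assuming the same `fall` frame as above:
```
# Minimal mode: disables expensive correlations/interactions for wide frames
profile_fall = ProfileReport(fall, title='Fall profile', minimal=True)
```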
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from tools import data_loader
import config as cfg

CASE = "Complete"
COMPLETE_FILENAME = "complete_with_embeddings.csv"
FALL_FILENAME = "fall_with_embeddings.csv"
SCALING_STRATEGY = "Standard"

def main():
    # Scale features, then swap each candidate classifier in via 'passthrough'
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', 'passthrough')])

    # Candidate hyperparameter values per classifier
    n_neighbors = [2, 4, 6, 8, 10, 15, 20, 50, 100, 500]
    max_depth = [2, 4, 6, 8, 10, 12, 14, 16, 25, 50]
    n_estimators = [100, 150, 250, 300, 350, 400, 500, 600, 700, 1000, 2000, 5000]
    max_features = [4, 5, 6, 8, 10, 20, 36]
    min_samples_leaf = [5, 10, 15, 20, 25, 30, 35, 40]
    c = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e5, 1e10]
    penalty = ['none', 'l2']
    kernel = ['linear', 'poly', 'rbf']
    var_smoothing = [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-2, 1e-1, 1e2, 1e5]

    param_grid = [
        {
            'clf': [KNeighborsClassifier()],
            'clf__n_neighbors': n_neighbors
        },
        {
            'clf': [RandomForestClassifier(random_state=0)],
            'clf__max_depth': max_depth,
            'clf__n_estimators': n_estimators,
            'clf__max_features': max_features,
            'clf__min_samples_leaf': min_samples_leaf
        },
        {
            'clf': [LogisticRegression(random_state=0)],
            'clf__C': c,
            'clf__penalty': penalty
        },
        {
            'clf': [SVC(random_state=0)],
            'clf__kernel': kernel,
            'clf__C': c
        },
        {
            'clf': [GaussianNB()],
            'clf__var_smoothing': var_smoothing
        }
    ]

    if CASE == "Complete":
        X, y = data_loader.CompleteDataLoader(COMPLETE_FILENAME, cfg.COMPLETE_N_SCALE_COLS) \
            .load_data().prepare_data(SCALING_STRATEGY)
    else:
        X, y = data_loader.FallDataLoader(FALL_FILENAME, cfg.FALL_N_SCALE_COLS) \
            .load_data().prepare_data(SCALING_STRATEGY)

    grid = GridSearchCV(pipeline, cv=5, n_jobs=-1, param_grid=param_grid, scoring="accuracy")
    res = grid.fit(X, y)

    def report(results, n_top=10):
        # Print the n_top best-ranked parameter settings from cv_results_
        for i in range(1, n_top + 1):
            candidates = np.flatnonzero(results['rank_test_score'] == i)
            for candidate in candidates:
                print("Model with rank: {0}".format(i))
                print("Mean validation score: {0:.3f} (std: {1:.3f})"
                      .format(results['mean_test_score'][candidate],
                              results['std_test_score'][candidate]))
                print("Parameters: {0}".format(results['params'][candidate]))
                print("")

    report(res.cv_results_)

if __name__ == "__main__":
    main()
\ No newline at end of file
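Beyond the printed ranking, the fitted `GridSearchCV` object exposes the winning configuration through standard scikit-learn attributes, for example:
```
print(res.best_params_)                   # parameters of the top-ranked pipeline
print("{0:.3f}".format(res.best_score_))  # its mean cross-validated accuracy
best_model = res.best_estimator_          # refit on all data (refit=True by default)
```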
@@ -42,8 +42,8 @@ GENERAL_FEATURES = ['Gender', 'Age', 'Cluster']
 THRESHOLD_WEEKS = 8
 THRESHOLD_TRAINING = 10
-COMPLETE_N_SCALE_COLS = 17
-FALL_N_SCALE_COLS = 3
+COMPLETE_N_SCALE_COLS = 18
+FALL_N_SCALE_COLS = 4
 PATIENT_ID = 'PatientId'
 CITIZEN_ID = 'CitizenId'
......
@@ -2,11 +2,22 @@
 import config as cfg
 from tools import file_reader, file_writer
 from tools import feature_maker, preprocessor
+import pandas as pd

 def main():
-    converters = {'CitizenId': str}
-    df = file_reader.read_csv(cfg.INTERIM_DATA_DIR, 'timeseries.csv', converters=converters)
+    cl = file_reader.read_csv(cfg.INTERIM_DATA_DIR, 'cl.csv',
+                              converters={'CitizenId': str, 'Cluster': int})
+    df = file_reader.read_csv(cfg.INTERIM_DATA_DIR, 'timeseries.csv',
+                              converters={'CitizenId': str})
+
+    # Assign cluster
+    df = pd.merge(df, cl[['CitizenId', 'Cluster']], how='inner', on=['CitizenId'])
+    df['Cluster'] = df['Cluster'].fillna(0)

     # Assign completed
     df = feature_maker.make_number_completed(df)

     # Split categorical columns
     df = preprocessor.split_categorical_columns(df, col='Exercises', tag='Ex', resolution=10)
     df = preprocessor.split_categorical_columns(df, col='Ats', tag='Ats', resolution=10)
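One detail in the cluster merge above: with `how='inner'`, citizens missing from `cl.csv` are dropped before the `fillna(0)` can ever apply. If the zero-fill is meant to give unmatched citizens a default cluster, a left join would be needed; a sketch of that variant:
```
# Keep citizens without a cluster assignment and default them to cluster 0
df = pd.merge(df, cl[['CitizenId', 'Cluster']], how='left', on='CitizenId')
df['Cluster'] = df['Cluster'].fillna(0).astype(int)
```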
@@ -22,6 +33,9 @@ def main():
     df = preprocessor.drop_count_columns(df)
     df = preprocessor.drop_zero_columns(df)

+    # Convert types
+    df['MeanEvaluation'] = pd.Series.astype(df['MeanEvaluation'], dtype=int)
+
     file_writer.write_csv(df, cfg.PROCESSED_DATA_DIR, f'complete.csv')

 if __name__ == "__main__":
......
@@ -20,8 +20,7 @@ def main():
                                 'complete.csv',
                                 converters=converters)
-    n_numerical_cols = len(list(df.select_dtypes(exclude = ['object']))) - 1
-    embedded_df = df.iloc[:, n_numerical_cols:df.shape[1]-1]
+    embedded_df = df.iloc[:, cfg.COMPLETE_N_SCALE_COLS:df.shape[1]-1]
     for index in range(embedded_df.shape[1]):
         column = embedded_df.columns[index]
         labels_column = labels[index]
......
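The change from a computed `n_numerical_cols` to `cfg.COMPLETE_N_SCALE_COLS` matters because the new Cluster column is an integer: a dtype-based count now includes it and shifts the slice boundary, even though Cluster belongs with the embedded categorical block. A sketch of the failure mode under that assumption:
```
# Cluster has an integer dtype, so excluding 'object' columns now counts it:
n_numerical_cols = len(list(df.select_dtypes(exclude=['object']))) - 1  # off by one
# Pinning the boundary to the config constant keeps the embedded slice aligned:
embedded_df = df.iloc[:, cfg.COMPLETE_N_SCALE_COLS:df.shape[1] - 1]
```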
@@ -7,6 +7,8 @@ def main():
     # Load the data
     ats = file_reader.read_pickle(cfg.INTERIM_DATA_DIR, 'ats.pkl')
     fd = file_reader.read_pickle(cfg.INTERIM_DATA_DIR, 'fd.pkl')
+    cl = file_reader.read_csv(cfg.INTERIM_DATA_DIR, 'cl.csv',
+                              converters={'CitizenId': str, 'Cluster': int})

     # Remove duplicates from falls
     fd = fd.drop_duplicates(["CitizenId", "Date"])
@@ -28,6 +30,9 @@ def main():
     df = df.agg({'DevISOClass': "count"}).reset_index()
     df = df.rename(columns={'DevISOClass': 'NumberAts'})

+    # Assign cluster
+    df = pd.merge(df, cl[['CitizenId', 'Cluster']], how='inner', on=['CitizenId'])
+
     # Reduce length of ats and convert to a string
     df_ats[cfg.DEV_ISO_CLASS] = df_ats[cfg.DEV_ISO_CLASS].apply(lambda at: str(at)[0:6])
     df_ats = preprocessor.get_ats_list(df_ats)
@@ -60,6 +65,7 @@ def main():
     # Convert types
     df['BirthYear'] = pd.Series.astype(df['BirthYear'], dtype=int)
     df['NumberAts'] = pd.Series.astype(df['NumberAts'], dtype=int)
+    df['Cluster'] = pd.Series.astype(df['Cluster'], dtype=int)
     df['Fall'] = pd.Series.astype(df['Fall'], dtype=int)

     # Make file with ats + patient data
......
@@ -17,9 +17,8 @@ def main():
     df = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
                               'fall.csv',
                               converters=converters)
-    n_numerical_cols = 3
-    embedded_df = df.iloc[:, n_numerical_cols:df.shape[1]-1]
+    embedded_df = df.iloc[:, cfg.FALL_N_SCALE_COLS:df.shape[1]-1]
     for index in range(embedded_df.shape[1]):
         column = embedded_df.columns[index]
         labels_column = labels[index]
......
@@ -46,16 +46,8 @@ def write_tensorflow_model(model: tf.keras.Model,
                            path: Path) -> None:
     model.save(path)

-def write_lime_report(exp: LimeTabularExplainer,
-                      path: Path,
-                      file_name: str):
-    exp.save_to_file(Path.joinpath(path, f'{file_name}.html'))

-def write_lime_explanation(explanation: Explanation,
-                           path: Path,
-                           file_name: str):
-    fig = explanation.as_pyplot_figure()
-    fig.savefig(Path.joinpath(path, f'{file_name}.jpg'))

+def write_report(report, path, filename):
+    report.to_file(Path.joinpath(path, filename))

 def write_shap_summary_plot(shap_values: List[np.ndarray],
                             X_test: pd.DataFrame,
......