Commit 1d245811 authored by thecml

updated code for new io, added models to git

parent bf4eb4cc
Pipeline #94662 failed with stage in 4 minutes and 58 seconds
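
The recurring change in the diffs below swaps the io helpers' directory-and-filename arguments for file-like objects: callers now open the path themselves and shuttle bytes through an in-memory buffer with shutil.copyfileobj. A minimal sketch of the pattern, with plain pickle standing in for the repository's file_reader/file_writer wrappers (an assumption; the wrappers themselves are not shown in this commit):

import pickle
import shutil
from io import BytesIO
from pathlib import Path

def read_pickle_buffered(path: Path):
    # Copy the file's raw bytes into an in-memory buffer, rewind, then parse.
    infile = BytesIO()
    with open(path, 'rb') as fd:
        shutil.copyfileobj(fd, infile)
    infile.seek(0)
    return pickle.load(infile)

def write_pickle_buffered(obj, path: Path) -> None:
    # Serialize into the buffer first, rewind, then flush the buffer to disk.
    outfile = BytesIO()
    pickle.dump(obj, outfile)
    outfile.seek(0)
    with open(path, 'wb') as fd:
        shutil.copyfileobj(outfile, fd)

The same shape appears throughout the commit: StringIO for CSV text, BytesIO for pickle and joblib artifacts.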
@@ -80,11 +80,6 @@ target/
!/data/**/
!/data/**/.gitkeep
-# exclude models dir
-/models/**
-!/models/**/
-!/models/**/.gitkeep
# exclude reports dir
/reports/**
!/reports/**/
...
@@ -51,7 +51,7 @@ def main():
        outfile = BytesIO()
        file_writer.write_pickle(assistive_aids, outfile)
        outfile.seek(0)
        shutil.copyfileobj(outfile, fd)
    with open(Path.joinpath(pt.INTERIM_DATA_DIR, 'ic.pkl'), 'wb') as fd:
        outfile = BytesIO()
...
#!/usr/bin/env python
import pandas as pd
import paths as pt
from kmodes import kmodes
from tools import file_reader, file_writer, preprocessor

ATS_RESOLUTION = 50

def main():
    df = file_reader.read_csv(pt.INTERIM_DATA_DIR, 'screenings.csv',
                              converters={'CitizenId': str})
    # Split the comma-separated Ats column into 1Ats..50Ats feature columns
    df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats',
                                        resolution=ATS_RESOLUTION)
    cols_ats = [str(i)+'Ats' for i in range(1, ATS_RESOLUTION+1)]
    header_list = ['CitizenId'] + cols_ats
    df = df[header_list]

    # Cluster the categorical Ats features with k-modes
    model = kmodes.KModes(init='Huang', n_clusters=20, n_init=15, n_jobs=-1)
    model.fit(df.iloc[:, 1:].astype(str))
    # Cast to str here as well, matching the dtype used during fit
    predictions = model.predict(df.iloc[:, 1:].astype(str))

    clusters = pd.Series(predictions, name="Cluster")
    ats_sequence = df[cols_ats].apply(lambda x: ','.join(x.dropna().astype(str)), axis=1)
    clusters = pd.DataFrame({
        'CitizenId': df['CitizenId'],
        'Cluster': clusters,
        'Ats': ats_sequence
    })
    # One column per cluster centroid
    cluster_centroids = pd.DataFrame(dict(enumerate(model.cluster_centroids_)))

    file_writer.write_joblib(model, pt.CLUSTERS_DIR, 'km.joblib')
    file_writer.write_csv(cluster_centroids, pt.INTERIM_DATA_DIR, 'cluster_centroids.csv')
    file_writer.write_csv(clusters, pt.INTERIM_DATA_DIR, 'cl.csv')

if __name__ == '__main__':
    main()
#!/usr/bin/env python
import paths as pt
-from tools import file_reader, file_writer
+from tools import file_reader, file_writer, data_loader
from tools import preprocessor
from utility import embedder
from utility.settings import load_settings
import pandas as pd
import numpy as np
from pathlib import Path
-import yaml
+from io import StringIO
+import shutil

def main():
    for label_name in ["Complete", "Compliance", "Fall"]:
-        data_settings = load_settings('data.yaml')
-        ats = {str(i)+'Ats':str for i in range(1, data_settings['ats_resolution']+1)}
-        df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
-                                  f'{label_name.lower()}.csv',
-                                  converters=ats)
+        data_settings = load_settings(pt.CONFIGS_DIR, 'data.yaml')
+        ats_resolution = data_settings['ats_resolution']
+        ats = {str(i)+'Ats':str for i in range(1, ats_resolution+1)}
+        infile = StringIO()
+        file_path = pt.PROCESSED_DATA_DIR
+        file_name = f'{label_name.lower()}.csv'
+        with open(Path.joinpath(file_path, file_name), 'r') as fd:
+            shutil.copyfileobj(fd, infile)
+        infile.seek(0)
+        df = file_reader.read_csv(infile, converters=ats)
        cols_ats = [str(i)+'Ats' for i in range(1, data_settings['ats_resolution']+1)]
        unique_ats = [df[f'{i}Ats'].unique() for i in range(1, data_settings['ats_resolution']+1)]
@@ -28,7 +36,13 @@ def main():
        df = df.drop(cols_ats, axis=1)
        df = pd.concat([df.drop(label_name, axis=1), df_ats, df[[label_name]]], axis=1)
-        file_writer.write_csv(df, pt.PROCESSED_DATA_DIR, f'{label_name.lower()}_count.csv')
+        outfile = StringIO()
+        file_path = pt.PROCESSED_DATA_DIR
+        file_name = f'{label_name.lower()}_count.csv'
+        with open(Path.joinpath(file_path, file_name), 'w', newline='') as fd:
+            file_writer.write_csv(df, outfile)
+            outfile.seek(0)
+            shutil.copyfileobj(outfile, fd)

if __name__ == "__main__":
    main()
\ No newline at end of file
@@ -21,11 +21,16 @@ def main(ats_resolution: int = None):
    data_settings = load_settings(pt.CONFIGS_DIR, 'data.yaml')
    if ats_resolution == None:
        ats_resolution = data_settings['ats_resolution']
    ats = {str(i)+'Ats':str for i in range(1, ats_resolution+1)}
-    df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
-                              f'{label_name.lower()}.csv',
-                              converters=ats)
+    infile = StringIO()
+    file_path = pt.PROCESSED_DATA_DIR
+    file_name = f'{label_name.lower()}.csv'
+    with open(Path.joinpath(file_path, file_name), 'r') as fd:
+        shutil.copyfileobj(fd, infile)
+    infile.seek(0)
+    df = file_reader.read_csv(infile, converters=ats)
    emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
    n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
...
@@ -6,22 +6,36 @@ from utility import embedder
import pandas as pd
import numpy as np
from pathlib import Path
-import yaml
from utility.settings import load_settings
+from io import StringIO
+import shutil

def main():
    for label_name in ["Complete", "Compliance", "Fall"]:
-        settings = load_settings("data.yaml")
+        settings = load_settings(pt.CONFIGS_DIR, "data.yaml")
        ats = {str(i)+'Ats':str for i in range(1, settings['ats_resolution']+1)}
-        df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, f'{label_name.lower()}.csv', converters=ats)
+        infile = StringIO()
+        file_path = pt.PROCESSED_DATA_DIR
+        file_name = f'{label_name.lower()}.csv'
+        with open(Path.joinpath(file_path, file_name), 'r') as fd:
+            shutil.copyfileobj(fd, infile)
+        infile.seek(0)
+        df = file_reader.read_csv(infile, converters=ats)
        ats_cols = [str(i)+'Ats' for i in range(1, settings['ats_resolution']+1)]
        df_enc = preprocessor.one_hot_encode(df, ats_cols)
        df = pd.concat([df.drop(ats_cols + [label_name], axis=1),
                        df_enc, df[[label_name]]], axis=1)
-        file_writer.write_csv(df, pt.PROCESSED_DATA_DIR, f'{label_name.lower()}_ohe.csv')
+        outfile = StringIO()
+        file_path = pt.PROCESSED_DATA_DIR
+        file_name = f'{label_name.lower()}_ohe.csv'
+        with open(Path.joinpath(file_path, file_name), 'w', newline='') as fd:
+            file_writer.write_csv(df, outfile)
+            outfile.seek(0)
+            shutil.copyfileobj(outfile, fd)

if __name__ == "__main__":
    main()
\ No newline at end of file
#!/usr/bin/env python
-from numpy.lib import utils
from tools import file_reader, file_writer, preprocessor
from utility.settings import load_settings
from pathlib import Path
import pandas as pd
import numpy as np
import paths as pt
-import yaml
+from io import BytesIO
+import shutil

def main():
-    df = file_reader.read_pickle(pt.INTERIM_DATA_DIR, 'ats.pkl')
+    with open(Path.joinpath(pt.INTERIM_DATA_DIR, 'ats.pkl'), 'rb') as fd:
+        infile = BytesIO()
+        shutil.copyfileobj(fd, infile)
+        infile.seek(0)
+        df = file_reader.read_pickle(infile)
    settings = load_settings(pt.CONFIGS_DIR, "data.yaml")
    ats_iso_length = settings['ats_iso_length']
@@ -118,7 +123,11 @@ def main():
                                          resolution=ats_resolution)
    data_dict = {'x': data_x, 'y': data_y}
-    file_writer.write_pickle(data_dict, pt.PROCESSED_DATA_DIR, "alarm_data.pkl")
+    with open(Path.joinpath(pt.PROCESSED_DATA_DIR, "alarm_data.pkl"), 'wb') as fd:
+        outfile = BytesIO()
+        file_writer.write_pickle(data_dict, outfile)
+        outfile.seek(0)
+        shutil.copyfileobj(outfile, fd)

if __name__ == "__main__":
    main()
\ No newline at end of file
@@ -8,9 +8,13 @@ from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
+from sklearn.metrics import precision_recall_curve, auc
from pandas.api.types import is_string_dtype, is_numeric_dtype
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
+from io import StringIO
+import shutil
+from pathlib import Path

CASE = "Complete"
FILENAME = "complete.csv"
@@ -116,10 +120,14 @@ def build_embedding_network(cat_cols, num_cols):
def main():
    ats_cols = {str(i)+'Ats':str for i in range(1, ATS_RESOLUTION+1)}
-    df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
-                              FILENAME,
-                              converters=ats_cols)
+    infile = StringIO()
+    file_path = pt.PROCESSED_DATA_DIR
+    with open(Path.joinpath(file_path, FILENAME), 'r') as fd:
+        shutil.copyfileobj(fd, infile)
+    infile.seek(0)
+    df = file_reader.read_csv(infile, converters=ats_cols)
    emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
    n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
@@ -129,7 +137,7 @@ def main():
    # Prepare the data
    X, y = preprocessor.get_X_y(df, CASE)
-    X, labels = preprocessor.encode_vector_label(X, n_numerical_cols)
+    X, _ = preprocessor.encode_vector_label(X, n_numerical_cols)
    y = np.array(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=0)
@@ -200,9 +208,8 @@ def main():
    print("Final recall: %.5f" % recall_score(y_test, y_scores_new))
    print("Final rocauc: %.5f" % roc_auc_score(y_test, y_pred_final))
-    from sklearn.metrics import precision_recall_curve, auc
    precision, recall, _ = precision_recall_curve(y_test, y_pred_final)
    print("Final prauc: %.5f" % auc(recall, precision))

if __name__ == '__main__':
    main()
\ No newline at end of file
#!/usr/bin/env python
import pandas as pd
import paths as pt
+from pathlib import Path
from tools import file_reader, file_writer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from utility.settings import load_settings
from sklearn.model_selection import train_test_split
from sksurv.ensemble import RandomSurvivalForest
+from io import StringIO, BytesIO
+import shutil

USE_LABEL_ENC = True

def main():
    settings = load_settings(pt.CONFIGS_DIR, "data.yaml")
    ats_resolution = settings['ats_resolution']
-    data = file_reader.read_pickle(pt.PROCESSED_DATA_DIR, "alarm_data.pkl")
+    with open(Path.joinpath(pt.PROCESSED_DATA_DIR, "alarm_data.pkl"), 'rb') as fd:
+        infile = BytesIO()
+        shutil.copyfileobj(fd, infile)
+        infile.seek(0)
+        data = file_reader.read_pickle(infile)
    data_x = data['x']
    data_y = data['y']
@@ -37,8 +46,17 @@ def main():
                               n_jobs=-1, random_state=0)
    model.fit(data_x, data_y)
-    file_writer.write_pickle(labels_enc, pt.MODELS_DIR, "alarm_labels.pkl")
-    file_writer.write_joblib(model, pt.MODELS_DIR, "alarm_rsf.joblib")
+    with open(Path.joinpath(pt.MODELS_DIR, "alarm_labels.pkl"), 'wb') as fd:
+        outfile = BytesIO()
+        file_writer.write_pickle(labels_enc, outfile)
+        outfile.seek(0)
+        shutil.copyfileobj(outfile, fd)
+    with open(Path.joinpath(pt.MODELS_DIR, "alarm_rsf.joblib"), 'wb') as fd:
+        outfile = BytesIO()
+        file_writer.write_joblib(model, outfile)
+        outfile.seek(0)
+        shutil.copyfileobj(outfile, fd)

if __name__ == '__main__':
    main()
\ No newline at end of file
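
Loading the saved model back would presumably mirror the buffered read used above; a sketch assuming plain joblib (joblib.load does accept file-like objects), not code from this commit:

import shutil
from io import BytesIO
from pathlib import Path
import joblib
import paths as pt

with open(Path.joinpath(pt.MODELS_DIR, "alarm_rsf.joblib"), 'rb') as fd:
    infile = BytesIO()
    shutil.copyfileobj(fd, infile)  # raw bytes into memory
    infile.seek(0)                  # rewind before deserializing
    model = joblib.load(infile)     # joblib accepts file-like objects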
@@ -3,14 +3,15 @@ import numpy as np
import pandas as pd
import paths as pt
from tools import file_reader, file_writer, data_loader
-from utility import metrics
+from utility.settings import load_settings
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import xgboost as xgb
from pathlib import Path
-import yaml
+from io import BytesIO
+import shutil

CASES = ["Complete", "Compliance", "Fall"]
DATASET_VERSION = 'emb'
@@ -18,23 +19,26 @@ DATASET_VERSION = 'emb'
def main():
    for case in CASES:
        if case == "Complete":
-            with open(Path.joinpath(pt.CONFIGS_DIR, "complete.yaml"), 'r') as stream:
-                settings = yaml.safe_load(stream)
+            settings = load_settings(pt.CONFIGS_DIR, "complete.yaml")
            file_name = f'complete_{DATASET_VERSION}.csv'
-            dl = data_loader.CompleteDataLoader(file_name, settings).load_data()
-            X, y = dl.get_data()
+            dl = data_loader.CompleteDataLoader(pt.PROCESSED_DATA_DIR,
+                                                file_name,
+                                                settings).load_data()
+            X, y = dl.prepare_data()
        elif case == "Compliance":
-            with open(Path.joinpath(pt.CONFIGS_DIR, "compliance.yaml"), 'r') as stream:
-                settings = yaml.safe_load(stream)
-            file_name = f'compliance_{DATASET_VERSION}.csv'
-            dl = data_loader.ComplianceDataLoader(file_name, settings).load_data()
-            X, y = dl.get_data()
+            settings = load_settings(pt.CONFIGS_DIR, "compliance.yaml")
+            file_name = f'compliance_{DATASET_VERSION}.csv'
+            dl = data_loader.ComplianceDataLoader(pt.PROCESSED_DATA_DIR,
+                                                  file_name,
+                                                  settings).load_data()
+            X, y = dl.prepare_data()
        else:
-            with open(Path.joinpath(pt.CONFIGS_DIR, "fall.yaml"), 'r') as stream:
-                settings = yaml.safe_load(stream)
+            settings = load_settings(pt.CONFIGS_DIR, "fall.yaml")
            file_name = f'fall_{DATASET_VERSION}.csv'
-            dl = data_loader.FallDataLoader(file_name, settings).load_data()
-            X, y = dl.get_data()
+            dl = data_loader.FallDataLoader(pt.PROCESSED_DATA_DIR,
+                                            file_name,
+                                            settings).load_data()
+            X, y = dl.prepare_data()

        neg, pos = np.bincount(y)
        scale_pos_weight = neg / pos
@@ -48,8 +52,14 @@ def main():
        model = xgb.XGBClassifier(**params)
        model.fit(X, y)
-        file_writer.write_joblib(model, pt.MODELS_DIR, f'{case.lower()}_xgb.joblib')
+        file_path = pt.MODELS_DIR
+        file_name = f'{case.lower()}_xgb.joblib'
+        with open(Path.joinpath(file_path, file_name), 'wb') as fd:
+            outfile = BytesIO()
+            file_writer.write_joblib(model, outfile)
+            outfile.seek(0)
+            shutil.copyfileobj(outfile, fd)

if __name__ == '__main__':
    main()
\ No newline at end of file
#!/usr/bin/env python
import paths as pt
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from tools import file_reader, file_writer, preprocessor, data_loader, inputter
from kmodes import kmodes
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from pathlib import Path
import yaml
class ClusterMaker(BaseEstimator, TransformerMixin):
    def __init__(self, init='random', n_clusters=1, n_init=1, ats_resolution=10):
        self.init = init
        self.n_clusters = n_clusters
        self.n_init = n_init
        self.ats_resolution = ats_resolution

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # Cluster the trailing Ats columns with k-modes and replace them
        # with a single Cluster feature
        model = kmodes.KModes(init=self.init, n_clusters=self.n_clusters,
                              n_init=self.n_init,
                              cat_dissim=kmodes.ng_dissim,
                              n_jobs=-1)
        model.fit(X.iloc[:, -self.ats_resolution:].astype(str))
        predictions = model.predict(X.iloc[:, -self.ats_resolution:].astype(str))
        predictions = pd.Series(predictions, name="Cluster")
        X = X.iloc[:, :-self.ats_resolution].reset_index(drop=True)
        X['Cluster'] = predictions
        return X

def main():
    # Load settings
    with open(Path.joinpath(pt.CONFIGS_DIR, "settings.yaml"), 'r') as stream:
        settings = yaml.safe_load(stream)

    # Load screenings and make ats from them
    screenings = file_reader.read_csv(pt.INTERIM_DATA_DIR, 'screenings.csv',
                                      converters={'CitizenId': str})
    df_ats = preprocessor.split_cat_columns(screenings, col_to_split='Ats',
                                            tag='Ats', resolution=10)
    df_ats = inputter.make_complete_feature(df_ats, settings)
    ats_cols = df_ats.filter(regex='((\d+)[Ats])\w+', axis=1)

    # Load processed emb dataset
    dl = data_loader.CompleteDataLoader("complete_emb.csv", settings).load_data()
    X, y = dl.get_data()

    # Merge datasets
    X = pd.concat([X, ats_cols], axis=1)

    pipeline = Pipeline([
        ('cluster_maker', ClusterMaker()),
        ('clf', RandomForestClassifier(random_state=0, class_weight="balanced"))
    ])
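
The fragment ends at the pipeline definition; GridSearchCV and StratifiedKFold are imported but unused in the lines shown. A hedged sketch of how such a pipeline might be tuned (the grid values and scoring choice are illustrative assumptions, not taken from the repository):

    # Illustrative only: tune ClusterMaker through the pipeline's step__param names.
    param_grid = {
        'cluster_maker__n_clusters': [5, 10, 20],
        'cluster_maker__n_init': [5, 15],
    }
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    grid = GridSearchCV(pipeline, param_grid, scoring='roc_auc', cv=skf, n_jobs=-1)
    grid.fit(X, y)
    print(grid.best_params_, grid.best_score_)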