Commit 1d245811 authored by thecml

updated code for new io, added models to git

parent bf4eb4cc
Pipeline #94662 failed with stage in 4 minutes and 58 seconds
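The "new io" pattern applied across these scripts stages data through an in-memory buffer and copies it to or from disk with shutil.copyfileobj, so the file_reader/file_writer helpers only ever see file-like objects. A minimal sketch of both directions, with pandas standing in for the project's helpers (file names here are illustrative):

import shutil
from io import StringIO
from pathlib import Path
import pandas as pd

df = pd.DataFrame({'CitizenId': ['1'], 'Cluster': [0]})

# Write: serialize into a buffer, rewind, then copy the buffer to disk.
outfile = StringIO()
df.to_csv(outfile, index=False)
outfile.seek(0)
with open(Path('cl.csv'), 'w', newline='') as fd:
    shutil.copyfileobj(outfile, fd)

# Read: copy the file into a buffer, rewind, then parse from the buffer.
infile = StringIO()
with open(Path('cl.csv'), 'r') as fd:
    shutil.copyfileobj(fd, infile)
infile.seek(0)
df = pd.read_csv(infile, converters={'CitizenId': str})
print(df)

For binary artifacts (pickles, joblib dumps), the same pattern uses BytesIO and 'rb'/'wb' modes, as in the hunks below.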
@@ -80,11 +80,6 @@ target/
!/data/**/
!/data/**/.gitkeep
# exclude models dir
/models/**
!/models/**/
!/models/**/.gitkeep
# exclude reports dir
/reports/**
!/reports/**/
......
@@ -51,7 +51,7 @@ def main():
        outfile = BytesIO()
        file_writer.write_pickle(assistive_aids, outfile)
        outfile.seek(0)
        shutil.copyfileobj(outfile, fd)
        shutil.copyfileobj(outfile, fd)
    with open(Path.joinpath(pt.INTERIM_DATA_DIR, 'ic.pkl'), 'wb') as fd:
        outfile = BytesIO()
......
#!/usr/bin/env python
import pandas as pd
import paths as pt
from kmodes import kmodes
from tools import file_reader, file_writer, preprocessor

ATS_RESOLUTION = 50

def main():
    df = file_reader.read_csv(pt.INTERIM_DATA_DIR, 'screenings.csv',
                              converters={'CitizenId': str})
    df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats',
                                        resolution=ATS_RESOLUTION)
    cols_ats = [str(i)+'Ats' for i in range(1, ATS_RESOLUTION+1)]
    header_list = ['CitizenId'] + cols_ats
    df = df[header_list]

    model = kmodes.KModes(init='Huang', n_clusters=20, n_init=15, n_jobs=-1)
    model.fit(df.iloc[:, 1:].astype(str))
    predictions = model.predict(df.iloc[:, 1:].astype(str).to_numpy())
    clusters = pd.Series(predictions, name="Cluster")

    ats_sequence = df[cols_ats].apply(lambda x: ','.join(x.dropna().astype(str)), axis=1)
    clusters = pd.DataFrame({
        'CitizenId': df['CitizenId'],
        'Cluster': clusters,
        'Ats': ats_sequence
    })
    cluster_centroids = pd.DataFrame(dict(enumerate(model.cluster_centroids_)))

    file_writer.write_joblib(model, pt.CLUSTERS_DIR, 'km.joblib')
    file_writer.write_csv(cluster_centroids, pt.INTERIM_DATA_DIR, 'cluster_centroids.csv')
    file_writer.write_csv(clusters, pt.INTERIM_DATA_DIR, 'cl.csv')

if __name__ == '__main__':
    main()
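A possible downstream use of the model saved above, assuming file_writer.write_joblib wraps joblib.dump and pt.CLUSTERS_DIR points at models/clusters (both assumptions, not confirmed by this diff):

import joblib
import pandas as pd

# Hypothetical path; the real location depends on pt.CLUSTERS_DIR.
model = joblib.load('models/clusters/km.joblib')

# KModes expects the same 50 string-typed Ats columns it was fitted on.
new_row = pd.DataFrame({f'{i}Ats': ['0'] for i in range(1, 51)})
print(model.predict(new_row.astype(str).to_numpy()))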
#!/usr/bin/env python
import paths as pt
from tools import file_reader, file_writer
from tools import file_reader, file_writer, data_loader
from tools import preprocessor
from utility import embedder
from utility.settings import load_settings
import pandas as pd
import numpy as np
from pathlib import Path
import yaml
from io import StringIO
import shutil
from pathlib import Path

def main():
    for label_name in ["Complete", "Compliance", "Fall"]:
        data_settings = load_settings('data.yaml')
        data_settings = load_settings(pt.CONFIGS_DIR, 'data.yaml')
        ats = {str(i)+'Ats':str for i in range(1, data_settings['ats_resolution']+1)}
        df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                                  f'{label_name.lower()}.csv',
                                  converters=ats)
        ats_resolution = data_settings['ats_resolution']
        ats = {str(i)+'Ats':str for i in range(1, ats_resolution+1)}
        infile = StringIO()
        file_path = pt.PROCESSED_DATA_DIR
        file_name = f'{label_name.lower()}.csv'
        with open(Path.joinpath(file_path, file_name), 'r') as fd:
            shutil.copyfileobj(fd, infile)
        infile.seek(0)
        df = file_reader.read_csv(infile, converters=ats)
        cols_ats = [str(i)+'Ats' for i in range(1, data_settings['ats_resolution']+1)]
        unique_ats = [df[f'{i}Ats'].unique() for i in range(1, data_settings['ats_resolution']+1)]
@@ -28,7 +36,13 @@ def main():
        df = df.drop(cols_ats, axis=1)
        df = pd.concat([df.drop(label_name, axis=1), df_ats, df[[label_name]]], axis=1)
        file_writer.write_csv(df, pt.PROCESSED_DATA_DIR, f'{label_name.lower()}_count.csv')
        outfile = StringIO()
        file_path = pt.PROCESSED_DATA_DIR
        file_name = f'{label_name.lower()}_count.csv'
        with open(Path.joinpath(file_path, file_name), 'w', newline='') as fd:
            file_writer.write_csv(df, outfile)
            outfile.seek(0)
            shutil.copyfileobj(outfile, fd)
if __name__ == "__main__":
main()
\ No newline at end of file
@@ -21,11 +21,16 @@ def main(ats_resolution: int = None):
        data_settings = load_settings(pt.CONFIGS_DIR, 'data.yaml')
        if ats_resolution is None:
            ats_resolution = data_settings['ats_resolution']
        ats = {str(i)+'Ats':str for i in range(1, ats_resolution+1)}
        df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                                  f'{label_name.lower()}.csv',
                                  converters=ats)
        infile = StringIO()
        file_path = pt.PROCESSED_DATA_DIR
        file_name = f'{label_name.lower()}.csv'
        with open(Path.joinpath(file_path, file_name), 'r') as fd:
            shutil.copyfileobj(fd, infile)
        infile.seek(0)
        df = file_reader.read_csv(infile, converters=ats)
        emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
        n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
......
@@ -6,22 +6,36 @@ from utility import embedder
import pandas as pd
import numpy as np
from pathlib import Path
import yaml
from utility.settings import load_settings
from io import StringIO
import shutil

def main():
    for label_name in ["Complete", "Compliance", "Fall"]:
        settings = load_settings("data.yaml")
        settings = load_settings(pt.CONFIGS_DIR, "data.yaml")
        ats = {str(i)+'Ats':str for i in range(1, settings['ats_resolution']+1)}
        df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, f'{label_name.lower()}.csv', converters=ats)
        infile = StringIO()
        file_path = pt.PROCESSED_DATA_DIR
        file_name = f'{label_name.lower()}.csv'
        with open(Path.joinpath(file_path, file_name), 'r') as fd:
            shutil.copyfileobj(fd, infile)
        infile.seek(0)
        df = file_reader.read_csv(infile, converters=ats)
        ats_cols = [str(i)+'Ats' for i in range(1, settings['ats_resolution']+1)]
        df_enc = preprocessor.one_hot_encode(df, ats_cols)
        df = pd.concat([df.drop(ats_cols + [label_name], axis=1),
                        df_enc, df[[label_name]]], axis=1)
        file_writer.write_csv(df, pt.PROCESSED_DATA_DIR, f'{label_name.lower()}_ohe.csv')
        outfile = StringIO()
        file_path = pt.PROCESSED_DATA_DIR
        file_name = f'{label_name.lower()}_ohe.csv'
        with open(Path.joinpath(file_path, file_name), 'w', newline='') as fd:
            file_writer.write_csv(df, outfile)
            outfile.seek(0)
            shutil.copyfileobj(outfile, fd)
if __name__ == "__main__":
main()
\ No newline at end of file
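For readers without the tools package, preprocessor.one_hot_encode presumably behaves like pandas.get_dummies restricted to the Ats columns; a minimal sketch under that assumption (toy column values):

import pandas as pd

def one_hot_encode(df: pd.DataFrame, cols: list) -> pd.DataFrame:
    # Expand each categorical Ats column into 0/1 indicator columns.
    return pd.get_dummies(df[cols].astype(str), columns=cols)

df = pd.DataFrame({'1Ats': ['a', 'b'], '2Ats': ['c', 'c']})
print(one_hot_encode(df, ['1Ats', '2Ats']))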
#!/usr/bin/env python
from numpy.lib import utils
from tools import file_reader, file_writer, preprocessor
from utility.settings import load_settings
from pathlib import Path
import pandas as pd
import numpy as np
import paths as pt
import yaml
from io import BytesIO
import shutil

def main():
    df = file_reader.read_pickle(pt.INTERIM_DATA_DIR, 'ats.pkl')
    with open(Path.joinpath(pt.INTERIM_DATA_DIR, 'ats.pkl'), 'rb') as fd:
        infile = BytesIO()
        shutil.copyfileobj(fd, infile)
        infile.seek(0)
        df = file_reader.read_pickle(infile)
    settings = load_settings(pt.CONFIGS_DIR, "data.yaml")
    ats_iso_length = settings['ats_iso_length']
@@ -118,7 +123,11 @@ def main():
                                        resolution=ats_resolution)
    data_dict = {'x': data_x, 'y': data_y}
    file_writer.write_pickle(data_dict, pt.PROCESSED_DATA_DIR, "alarm_data.pkl")
    with open(Path.joinpath(pt.PROCESSED_DATA_DIR, "alarm_data.pkl"), 'wb') as fd:
        outfile = BytesIO()
        file_writer.write_pickle(data_dict, outfile)
        outfile.seek(0)
        shutil.copyfileobj(outfile, fd)
if __name__ == "__main__":
main()
\ No newline at end of file
@@ -8,9 +8,13 @@ from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, auc
from pandas.api.types import is_string_dtype, is_numeric_dtype
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from io import StringIO
import shutil
from pathlib import Path

CASE = "Complete"
FILENAME = "complete.csv"
@@ -116,10 +120,14 @@ def build_embedding_network(cat_cols, num_cols):
def main():
    ats_cols = {str(i)+'Ats':str for i in range(1, ATS_RESOLUTION+1)}
    df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,
                              FILENAME,
                              converters=ats_cols)
    infile = StringIO()
    file_path = pt.PROCESSED_DATA_DIR
    with open(Path.joinpath(file_path, FILENAME), 'r') as fd:
        shutil.copyfileobj(fd, infile)
    infile.seek(0)
    df = file_reader.read_csv(infile, converters=ats_cols)
    emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
    n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
@@ -129,7 +137,7 @@ def main():
    # Prepare the data
    X, y = preprocessor.get_X_y(df, CASE)
    X, labels = preprocessor.encode_vector_label(X, n_numerical_cols)
    X, _ = preprocessor.encode_vector_label(X, n_numerical_cols)
    y = np.array(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=0)
@@ -200,9 +208,8 @@ def main():
    print("Final recall: %.5f" % recall_score(y_test, y_scores_new))
    print("Final rocauc: %.5f" % roc_auc_score(y_test, y_pred_final))
    from sklearn.metrics import precision_recall_curve, auc
    precision, recall, _ = precision_recall_curve(y_test, y_pred_final)
    print("Final prauc: %.5f" % auc(recall, precision))

if __name__ == '__main__':
    main()
    main()
\ No newline at end of file
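The PR-AUC printed above pairs precision_recall_curve with auc; a self-contained toy check of that recipe (synthetic labels and scores):

from sklearn.metrics import precision_recall_curve, auc

y_true = [0, 0, 1, 1]
y_score = [0.1, 0.4, 0.35, 0.8]
precision, recall, _ = precision_recall_curve(y_true, y_score)
print("prauc: %.5f" % auc(recall, precision))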
#!/usr/bin/env python
import pandas as pd
import paths as pt
from pathlib import Path
from tools import file_reader, file_writer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from utility.settings import load_settings
from sklearn.model_selection import train_test_split
from sksurv.ensemble import RandomSurvivalForest
from io import StringIO, BytesIO
import shutil

USE_LABEL_ENC = True

def main():
    settings = load_settings(pt.CONFIGS_DIR, "data.yaml")
    ats_resolution = settings['ats_resolution']
    data = file_reader.read_pickle(pt.PROCESSED_DATA_DIR, "alarm_data.pkl")
    with open(Path.joinpath(pt.PROCESSED_DATA_DIR, "alarm_data.pkl"), 'rb') as fd:
        infile = BytesIO()
        shutil.copyfileobj(fd, infile)
        infile.seek(0)
        data = file_reader.read_pickle(infile)
    data_x = data['x']
    data_y = data['y']
@@ -37,8 +46,17 @@ def main():
                                   n_jobs=-1, random_state=0)
    model.fit(data_x, data_y)
    file_writer.write_pickle(labels_enc, pt.MODELS_DIR, "alarm_labels.pkl")
    file_writer.write_joblib(model, pt.MODELS_DIR, "alarm_rsf.joblib")
    with open(Path.joinpath(pt.MODELS_DIR, "alarm_labels.pkl"), 'wb') as fd:
        outfile = BytesIO()
        file_writer.write_pickle(labels_enc, outfile)
        outfile.seek(0)
        shutil.copyfileobj(outfile, fd)
    with open(Path.joinpath(pt.MODELS_DIR, "alarm_rsf.joblib"), 'wb') as fd:
        outfile = BytesIO()
        file_writer.write_joblib(model, outfile)
        outfile.seek(0)
        shutil.copyfileobj(outfile, fd)
if __name__ == '__main__':
main()
\ No newline at end of file
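For context, a minimal end-to-end RandomSurvivalForest fit on a public scikit-survival dataset; the n_jobs/random_state values mirror the hunk above, everything else is illustrative:

from sksurv.column import encode_categorical
from sksurv.datasets import load_whas500
from sksurv.ensemble import RandomSurvivalForest

X, y = load_whas500()
X = encode_categorical(X)  # one-hot encode the categorical columns
model = RandomSurvivalForest(n_estimators=100, n_jobs=-1, random_state=0)
model.fit(X, y)
print("train c-index: %.3f" % model.score(X, y))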
@@ -3,14 +3,15 @@ import numpy as np
import pandas as pd
import paths as pt
from tools import file_reader, file_writer, data_loader
from utility import metrics
from utility.settings import load_settings
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import xgboost as xgb
from pathlib import Path
import yaml
from io import BytesIO
import shutil

CASES = ["Complete", "Compliance", "Fall"]
DATASET_VERSION = 'emb'
@@ -18,23 +19,26 @@ DATASET_VERSION = 'emb'
def main():
    for case in CASES:
        if case == "Complete":
            with open(Path.joinpath(pt.CONFIGS_DIR, "complete.yaml"), 'r') as stream:
                settings = yaml.safe_load(stream)
            settings = load_settings(pt.CONFIGS_DIR, "complete.yaml")
            file_name = f'complete_{DATASET_VERSION}.csv'
            dl = data_loader.CompleteDataLoader(file_name, settings).load_data()
            X, y = dl.get_data()
            dl = data_loader.CompleteDataLoader(pt.PROCESSED_DATA_DIR,
                                                file_name,
                                                settings).load_data()
            X, y = dl.prepare_data()
        elif case == "Compliance":
            with open(Path.joinpath(pt.CONFIGS_DIR, "compliance.yaml"), 'r') as stream:
                settings = yaml.safe_load(stream)
            file_name = f'compliance_{DATASET_VERSION}.csv'
            dl = data_loader.ComplianceDataLoader(file_name, settings).load_data()
            X, y = dl.get_data()
            settings = load_settings(pt.CONFIGS_DIR, "compliance.yaml")
            file_name = f'compliance_{DATASET_VERSION}.csv'
            dl = data_loader.ComplianceDataLoader(pt.PROCESSED_DATA_DIR,
                                                  file_name,
                                                  settings).load_data()
            X, y = dl.prepare_data()
        else:
            with open(Path.joinpath(pt.CONFIGS_DIR, "fall.yaml"), 'r') as stream:
                settings = yaml.safe_load(stream)
            settings = load_settings(pt.CONFIGS_DIR, "fall.yaml")
            file_name = f'fall_{DATASET_VERSION}.csv'
            dl = data_loader.FallDataLoader(file_name, settings).load_data()
            X, y = dl.get_data()
            dl = data_loader.FallDataLoader(pt.PROCESSED_DATA_DIR,
                                            file_name,
                                            settings).load_data()
            X, y = dl.prepare_data()

        neg, pos = np.bincount(y)
        scale_pos_weight = neg / pos
@@ -48,8 +52,14 @@ def main():
        model = xgb.XGBClassifier(**params)
        model.fit(X, y)
        file_writer.write_joblib(model, pt.MODELS_DIR, f'{case.lower()}_xgb.joblib')
        file_path = pt.MODELS_DIR
        file_name = f'{case.lower()}_xgb.joblib'
        with open(Path.joinpath(file_path, file_name), 'wb') as fd:
            outfile = BytesIO()
            file_writer.write_joblib(model, outfile)
            outfile.seek(0)
            shutil.copyfileobj(outfile, fd)
if __name__ == '__main__':
main()
\ No newline at end of file
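The neg/pos ratio computed earlier feeds XGBoost's scale_pos_weight to counter class imbalance; a minimal sketch on synthetic data (parameter values illustrative, not the project's tuned params):

import numpy as np
import xgboost as xgb
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, weights=[0.9], random_state=0)
neg, pos = np.bincount(y)
model = xgb.XGBClassifier(scale_pos_weight=neg / pos, random_state=0)
model.fit(X, y)
print(model.predict_proba(X[:5]))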
#!/usr/bin/env python
import paths as pt
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from tools import file_reader, file_writer, preprocessor, data_loader, inputter
from kmodes import kmodes
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from pathlib import Path
import yaml

class ClusterMaker(BaseEstimator, TransformerMixin):
    def __init__(self, init='random', n_clusters=1, n_init=1, ats_resolution=10):
        self.init = init
        self.n_clusters = n_clusters
        self.n_init = n_init
        self.ats_resolution = ats_resolution

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        model = kmodes.KModes(init=self.init, n_clusters=self.n_clusters,
                              n_init=self.n_init,
                              cat_dissim=kmodes.ng_dissim,
                              n_jobs=-1)
        model.fit(X.iloc[:, -self.ats_resolution:].astype(str))
        predictions = model.predict(X.iloc[:, -self.ats_resolution:].astype(str))
        predictions = pd.Series(predictions, name="Cluster")
        X = X.iloc[:, :-self.ats_resolution].reset_index(drop=True)
        X['Cluster'] = predictions
        return X

def main():
    # Load settings
    with open(Path.joinpath(pt.CONFIGS_DIR, "settings.yaml"), 'r') as stream:
        settings = yaml.safe_load(stream)

    # Load screenings and make ats from them
    screenings = file_reader.read_csv(pt.INTERIM_DATA_DIR, 'screenings.csv',
                                      converters={'CitizenId': str})
    df_ats = preprocessor.split_cat_columns(screenings, col_to_split='Ats', tag='Ats', resolution=10)
    df_ats = inputter.make_complete_feature(df_ats, settings)
    ats_cols = df_ats.filter(regex=r'((\d+)[Ats])\w+', axis=1)

    # Load processed emb dataset
    dl = data_loader.CompleteDataLoader("complete_emb.csv", settings).load_data()
    X, y = dl.get_data()

    # Merge datasets
    X = pd.concat([X, ats_cols], axis=1)

    pipeline = Pipeline([
        ('cluster_maker', ClusterMaker()),
        ('clf', RandomForestClassifier(random_state=0, class_weight="balanced"))
    ])
    param_grid = [
        {
            'cluster_maker__init': ['random', 'Huang', 'Cao'],
            'cluster_maker__n_clusters': [5, 10, 15, 20, 30],
            'cluster_maker__n_init': [1, 5, 10, 15, 20]
        }
    ]

    scoring = 'average_precision'
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    clf = GridSearchCV(pipeline, param_grid=param_grid,
                       scoring=scoring, cv=skf)
    clf.fit(X, y)

    print('\nAll results:')
    print(clf.cv_results_)
    print('\nBest estimator:')
    print(clf.best_estimator_)
    print('\nBest score:')
    print(clf.best_score_)
    print('\nBest hyperparameters:')
    print(clf.best_params_)

    results = pd.DataFrame(clf.cv_results_)
    file_writer.write_csv(results, pt.REPORTS_DIR, 'kmodes-settings-grid-search-results.csv')

if __name__ == '__main__':
    main()
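The cluster_maker__* keys in param_grid use scikit-learn's step__parameter convention for addressing pipeline steps; a toy illustration:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pipe = Pipeline([('scaler', StandardScaler()), ('clf', LogisticRegression())])
# Set a nested parameter on the 'clf' step via the step__param syntax.
pipe.set_params(clf__C=0.5)
print(pipe.get_params()['clf__C'])  # 0.5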