Commit 5b4d30fd authored by Christian Marius Lillelund

Changed cluster settings

parent 7b8b4cbe
Pipeline #26451 failed in 2 minutes and 5 seconds
@@ -27,14 +27,20 @@ def create_clusters(hu):
     print("Unique clusters type E after sorting for cluster sizes greater than {}: {}"
           .format(clusterthr, cluster_guess.shape))
-    # Train
+    # Train model
     model = kmd.k_modes_train(data, init=cluster_guess.values)
+    # model = load_model()
     # Make predictions
     data['Cluster'] = pd.Series(model.predict(data.to_numpy()), index=data.index)
-    data = data[['Cluster']]
-    data.index = data.index.set_names(['CitizenId'])
-    data.to_csv(Path.joinpath(cfg.INTERIM_DATA_DIR, 'clusters.csv'), index=True)
+    # Save the ATS sequence
+    data['ATS'] = data[data.columns[0:23]].apply(lambda x: ','.join(x.dropna().astype(str)), axis=1)
+    # Save clusters
+    data = data.reset_index()[['CitizenId', 'Cluster', 'ATS']]
+    data.to_csv(Path.joinpath(cfg.INTERIM_DATA_DIR, 'clusters.csv'), index=False)
     return model

 def create_vectors(hu):
...
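The new save step flattens each citizen's ATS device columns into a single comma-separated string before writing clusters.csv. A minimal standalone sketch of that pattern, with made-up column names and values:

```python
import pandas as pd

# Hypothetical frame: one row per citizen, device codes spread over columns.
df = pd.DataFrame({'dev0': ['112', '222', None],
                   'dev1': ['311', None, None]},
                  index=pd.Index(['A', 'B', 'C'], name='CitizenId'))
df['Cluster'] = [0, 1, 0]

# Join the non-null device columns into one comma-separated ATS string,
# mirroring the .dropna().astype(str) + ','.join step in create_clusters.
df['ATS'] = df[['dev0', 'dev1']].apply(
    lambda x: ','.join(x.dropna().astype(str)), axis=1)

out = df.reset_index()[['CitizenId', 'Cluster', 'ATS']]
print(out)  # A -> '112,311', B -> '222', C -> ''
```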
@@ -18,64 +18,21 @@ def k_modes_train(hu, init='Random'):
     :return: The fitted KModes object.
     """
-    # Variable to keep track of the results of the search
-    performances = pd.DataFrame()
-    # Methods: "Cao" / "Huang" / "random"
-    # TODO: Set 1 init and find a range that works
-    n_inits = range(1, 2)  # Number of times the k-modes algorithm will be run with different centroid seeds
-    # The number of clusters to form as well as the number of centroids to generate.
-    # TODO: Run once.
-    n_cluster_range = range(4, 5) if isinstance(init, str) else range(init.shape[0], init.shape[0] + 1)
-    # Metrics on the fitness of the clustering tries
-    best_cost = None
-    best_k_modes = None
-    for n in n_inits:
-        for n_cluster in n_cluster_range:
-            # Initialize KModes object
-            km = KModes(init=init,
-                        n_clusters=n_cluster,
-                        n_init=n,
-                        cat_dissim=ut.dubbah_dissim)
-            # Perform k-modes
-            km.fit_predict(hu)
-            print('test of print cluster centroids')
-            print(km.cluster_centroids_)
-            # Prepare a summary of the model fitness
-            performance = pd.DataFrame([{"init": init,
-                                         "n_init_param": n_inits,
-                                         "n_clusters": n_cluster,
-                                         "n_iter_att": km.n_iter_,
-                                         "cost": km.cost_,
-                                         "centroids": km.cluster_centroids_,
-                                         "numberInEachCluster": np.unique(km.labels_, return_counts=True)[1]}])
-            performances = pd.concat([performances, performance], axis=0)
-            # Update the best model across initialization methods
-            if best_cost is None or km.cost_ < best_cost:
-                best_k_modes = km
-    # Save the best model for each initialization method as a serialized KModes object
-    init = 'Self' if hasattr(init, '__array__') else init
-    with open(str(Path.joinpath(INTERIM_DATA_DIR, 'kmodescluster' + init + 'init.pkl')), 'wb') as output:
-        pickle.dump(best_k_modes, output, pickle.HIGHEST_PROTOCOL)
-    print("init: {}, init range: {}, n_cluster = {}".format(init, n_inits, n_cluster_range))
+    # Initialize KModes
+    km = KModes(init=init, n_clusters=init.shape[0], n_init=2, cat_dissim=ut.dubbah_dissim)
+    # km = KModes(init='random', n_clusters=2, n_init=1, cat_dissim=ut.dubbah_dissim)
+    # Perform k-modes
+    km.fit_predict(hu)
     n_cluster_range = range(500, 501)
     # range(1874, 1875)  # The number of clusters to form as well as the number of centroids to generate.
-    return best_k_modes
+    return km
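The new code passes precomputed seed centroids as `init`, in which case the kmodes package expects `n_clusters` to equal the number of centroid rows, hence `n_clusters=init.shape[0]`. A minimal sketch of that usage with invented data; the library's built-in `matching_dissim` stands in here for the project's custom `dubbah_dissim`:

```python
import numpy as np
from kmodes.kmodes import KModes
from kmodes.util.dissim import matching_dissim

# Made-up categorical data: 6 observations, 3 attributes.
X = np.array([[0, 1, 2], [0, 1, 1], [2, 2, 0],
              [2, 2, 1], [0, 1, 2], [2, 2, 0]])

# Seed centroids; with an explicit ndarray init the run is deterministic,
# so a single initialization (n_init=1) suffices.
init = np.array([[0, 1, 2], [2, 2, 0]])

km = KModes(init=init, n_clusters=init.shape[0], n_init=1,
            cat_dissim=matching_dissim)
labels = km.fit_predict(X)
print(labels)                  # cluster assignment per observation
print(km.cost_, km.n_iter_)    # fitness metrics used by the old search loop
print(km.cluster_centroids_)   # final modes, one row per cluster
```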
 def k_modes_predict(vector, km=None, readserialized: bool = True, init: str = 'self'):
@@ -96,7 +53,6 @@ def k_modes_predict(vector, km=None, readserialized: bool = True, init: str = 'self'):
     n_att = len(km.cluster_centroids_[0])  # 28
     vector = np.append(np.array(vector), np.array([0] * (n_att - len(vector)))).reshape(1, -1)
     # TODO: Check that seq is a number of devices
     cluster = km.predict(vector)  # [0933 0 0 0 ...0]
     seq = km.cluster_centroids_[cluster]  # 77
     return seq
...
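The predict helper zero-pads an input vector to the centroid width before calling `km.predict`. A small sketch of just that padding step, with hypothetical widths and values:

```python
import numpy as np

n_att = 28                 # centroid width, i.e. len(km.cluster_centroids_[0])
vector = [112, 222, 311]   # hypothetical ATS sequence, shorter than a centroid

# Right-pad with zeros to the centroid width and shape as a single sample.
padded = np.append(np.array(vector),
                   np.array([0] * (n_att - len(vector)))).reshape(1, -1)
print(padded.shape)  # (1, 28), ready to pass to km.predict(padded)
```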
@@ -6,7 +6,7 @@ from src.config import *
 def dubbah_dissim(a, b, X=None, membship=None):
     """
-    Utility function for our personalized dissimilarity measure following the API on:
+    Utility function for our dissimilarity measure following the API on:
     https://github.com/nicodv/kmodes/blob/master/kmodes/util/dissim.py
     E.g.
@@ -25,7 +25,7 @@ def dubbah_dissim(a, b, X=None, membship=None):
     [0, 1, 2]
     [0, 1, 0] = 1
     Similarity = sum(Step1, Step2, Step3) = 4
     :param verbose: A boolean indicating whether to print the params
     :param a: collection of cluster centroids, a numpy array with shape (centroids, attributes)
...
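The docstring follows the dissimilarity API from the kmodes repository: a measure receives the centroid array `a` and a sample `b` and returns one distance per centroid. The package's own matching (Hamming-style) measure is the simplest instance of that API and reproduces the `[0, 1, 2]` vs `[0, 1, 0] = 1` example above; it is shown here as a reference point, not as the project's custom `dubbah_dissim`:

```python
import numpy as np

def matching_dissim(a, b, X=None, membship=None):
    """Hamming-style dissimilarity: count of mismatched attributes
    between each centroid row in `a` and the sample `b`."""
    return np.sum(a != b, axis=1)

centroids = np.array([[0, 1, 2],
                      [0, 1, 0]])
sample = np.array([0, 1, 0])
print(matching_dissim(centroids, sample))  # [1 0]
```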
import json
import os
from pathlib import Path
FILE_PATHS = ['DigiRehab_BorgerID_TrainingDone.xlsx',
...
#!/usr/bin/env python
"""
Authors: Cecilie Moriat, Tenna Rasmussen, Christian Fischer Pedersen
Date: 20th March, 2020
"""
import src.clustering.cluster_maker as cluster_maker
import src.clustering.kmodes_clf as kmd
import src.data.parser as parser
import src.data.cleaner as cleaner
import src.utility.helper_func as hf
import src.data.file_reader as fr
import src.data.file_writer as fw
import src.config as cfg
import src.log as log
import pandas as pd
import os

def main():
    clusters = fr.read_csv(cfg.INTERIM_DATA_DIR, 'clusters.csv')
    timeseries = fr.read_csv(cfg.INTERIM_DATA_DIR, 'timeseries.csv')
    df = clusters.loc[clusters.CitizenId.isin(list(timeseries.CitizenId))]
    fw.write_csv(df, cfg.INTERIM_DATA_DIR, 'citizen_clusters.csv')

if __name__ == '__main__':
    main()
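This script keeps only the cluster rows whose CitizenId also appears in the timeseries data. The same `isin` intersection can be checked on a toy example (data invented here):

```python
import pandas as pd

clusters = pd.DataFrame({'CitizenId': [1, 2, 3], 'Cluster': [0, 1, 0]})
timeseries = pd.DataFrame({'CitizenId': [2, 3, 4]})

# Keep cluster assignments only for citizens that also have timeseries rows.
df = clusters.loc[clusters.CitizenId.isin(list(timeseries.CitizenId))]
print(df)  # rows for CitizenId 2 and 3
```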
@@ -58,7 +58,7 @@ def run():
     # Create timeseries features and save it
     data = data_dto.Data(patient_data, screening_values, status_set,
                          training_done, training_cancelled, assistive_aids, clusters)
     features = fm.make_timeseries_features(data)
     file_writer.write_csv(features, cfg.INTERIM_DATA_DIR, 'timeseries.csv')
...
@@ -52,7 +52,7 @@ def run():
     train_rf(X, y)

 def train_rf(X, y):
-    clf = clfs.get_classifier("Random Forest")
+    clf = clfs.get_classifier("RF")
     mean_auc, std_auc, mean_acc, cm, model = cv.make_cross_val(clf, X, y)
     print(f"Mean AUC: {np.round(mean_auc, 3)}")
     print(f"Std AUC: {np.round(std_auc, 3)}")
...
@@ -45,7 +45,7 @@ def run():
     train_rf(X, y)

 def train_rf(X, y):
-    clf = clfs.get_classifier("Random Forest")
+    clf = clfs.get_classifier("LR")
     mean_auc, std_auc, mean_acc, cm, model = cv.make_cross_val(clf, X, y)
     print(f"Mean AUC: {np.round(mean_auc, 3)}")
     print(f"Std AUC: {np.round(std_auc, 3)}")
...
@@ -48,7 +48,7 @@ def run():
     train_rf(X, y)

 def train_rf(X, y):
-    clf = clfs.get_classifier("Random Forest")
+    clf = clfs.get_classifier("RF")
     mean_auc, std_auc, mean_acc, cm, model = cv.make_cross_val(clf, X, y)
     print(f"Mean AUC: {np.round(mean_auc, 3)}")
     print(f"Std AUC: {np.round(std_auc, 3)}")
...
@@ -3,9 +3,9 @@ from sklearn.ensemble import RandomForestClassifier
 def get_classifier(name, random_state=0, n_jobs=None):
     classifiers = {
-        'Random Forest': RandomForestClassifier(n_estimators=400,
+        'RF': RandomForestClassifier(n_estimators=400,
             class_weight='balanced', n_jobs=n_jobs, random_state=random_state),
-        'Logistic Regression': LogisticRegression(solver="liblinear",
+        'LR': LogisticRegression(solver="liblinear",
             class_weight='balanced', n_jobs=n_jobs, random_state=random_state)
     }
     return classifiers[name]
\ No newline at end of file
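After the rename, callers request models by the short keys 'RF' and 'LR'. A quick usage sketch of the lookup against a synthetic dataset (the data is generated here purely for illustration):

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

def get_classifier(name, random_state=0, n_jobs=None):
    # Same lookup table as in the diff above, keyed by the new short names.
    classifiers = {
        'RF': RandomForestClassifier(n_estimators=400, class_weight='balanced',
                                     n_jobs=n_jobs, random_state=random_state),
        'LR': LogisticRegression(solver="liblinear", class_weight='balanced',
                                 n_jobs=n_jobs, random_state=random_state)
    }
    return classifiers[name]

X, y = make_classification(n_samples=200, random_state=0)
clf = get_classifier('RF', n_jobs=-1)
clf.fit(X, y)
print(clf.score(X, y))
```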
@@ -8,6 +8,7 @@ import src.data.file_writer as file_writer
 import src.config as cfg
 import src.log as log
 import os
+from sklearn.preprocessing import StandardScaler

 logger = log.setup_logger(os.path.basename(__file__))
@@ -18,14 +19,17 @@ def make_cross_val(clf, X, y, n_splits=5, shuffle=True, random_state=0):
     total_confusion_matrix = np.zeros(shape=(2, 2))
     cv = StratifiedKFold(n_splits, shuffle, random_state)
+    sc = StandardScaler()
     for train_index, test_index in cv.split(X, y):
-        logger.debug(f'Running CV for {train_index} and {test_index}')
         # Make train/test split
         X_train, y_train = X.iloc[train_index], y.iloc[train_index]
         X_test, y_test = X.iloc[test_index], y.iloc[test_index]
+        # Scale data
+        X_train = sc.fit_transform(X_train)
+        X_test = sc.transform(X_test)
         # Train and get predictions
         model, y_pred, y_pred_proba = train_and_predict(clf, X_train, X_test, y_train)
@@ -42,7 +46,6 @@ def make_cross_val(clf, X, y, n_splits=5, shuffle=True, random_state=0):
         # Get normalized CM and add to total
         cm = met.get_confusion_matrix(y_test, y_pred)
         total_confusion_matrix = np.add(total_confusion_matrix, cm)
-        logger.debug(f'Completed CV run for {train_index} and {test_index}')
     # Compute collective results
     mean_model_auc = mean(model_results_auc)
...
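The change fits the StandardScaler on each training fold only and then applies the fitted parameters to the held-out fold, so no test-fold statistics leak into the model. A self-contained sketch of the same per-fold scaling pattern, on synthetic data and with a stand-in classifier:

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=300, random_state=0)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
sc = StandardScaler()
aucs = []
for train_index, test_index in cv.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Fit scaling parameters on the training fold only, then reuse them
    # unchanged on the test fold to avoid leakage.
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    clf = LogisticRegression(solver="liblinear").fit(X_train, y_train)
    aucs.append(roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
print(np.mean(aucs), np.std(aucs))
```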