Commit 65bd6ef3 authored by Christian Marius Lillelund

improved readability

parent 6b4b6054
Pipeline #45731 passed with stage in 3 minutes and 20 seconds
@@ -29,7 +29,7 @@ def main():
    n_estimators = [100, 200, 400, 800]
    max_features = [4, 8, 20, 36]
    min_samples_leaf = [5, 10, 20, 40]
    kernel = ['linear', 'poly', 'rbf']
    kernel = ['linear', 'poly', 'rbf']
    param_grid = [
        {
......
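The hunk above only shows the start of param_grid. As a hedged illustration of how mixed random-forest and SVM settings like these are often wired into a single scikit-learn search, here is a sketch using a hypothetical pipeline with a swappable 'clf' step; the estimators, pipeline, and cross-validation setup below are assumptions, not taken from the repository:

```python
# Hypothetical sketch only: the pipeline, estimators and cv setting are assumptions.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

n_estimators = [100, 200, 400, 800]
max_features = [4, 8, 20, 36]
min_samples_leaf = [5, 10, 20, 40]
kernel = ['linear', 'poly', 'rbf']

pipe = Pipeline([('scaler', StandardScaler()),
                 ('clf', RandomForestClassifier())])

# A list of dicts lets one GridSearchCV try both classifier families:
param_grid = [
    {'clf': [RandomForestClassifier()],
     'clf__n_estimators': n_estimators,
     'clf__max_features': max_features,
     'clf__min_samples_leaf': min_samples_leaf},
    {'clf': [SVC()],
     'clf__kernel': kernel},
]

search = GridSearchCV(pipe, param_grid, cv=5)
# search.fit(X_train, y_train)   # X_train/y_train come from the surrounding script
```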
@@ -5,15 +5,11 @@ import numpy as np
import pandas as pd
import config as cfg
import log as log
from tools import file_reader, file_writer
from utility import dataset, cluster_maker, kmodes_wrapper
from tools import file_reader, file_writer, cluster_maker
from utility import dataset, kmodes_wrapper
def main():
    logger = log.setup_logger(os.path.basename(__file__))
    ats = file_reader.read_pickle(cfg.INTERIM_DATA_DIR, 'ats.pkl')
    ids = dataset.create_union_of_ids(ats)
    data = cluster_maker.make_cluster_data(ats)
    model = kmodes_wrapper.fit_classifier(data.iloc[:, 1:], init='random', n_clusters=32, n_init=8)
......
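kmodes_wrapper.fit_classifier itself is not part of this commit. A minimal sketch of what such a wrapper might do, assuming it builds on the kmodes package's KModes estimator; the wrapper's real implementation may differ:

```python
# Hypothetical sketch of a k-modes fit on categorical device data,
# assuming the `kmodes` package; the wrapper's real internals are not shown here.
from kmodes.kmodes import KModes

def fit_classifier(X, init='random', n_clusters=32, n_init=8):
    """Fit a k-modes model on a frame of categorical columns and return it."""
    model = KModes(n_clusters=n_clusters, init=init, n_init=n_init, verbose=0)
    model.fit(X)
    return model

# Usage mirroring the script above:
# model = fit_classifier(data.iloc[:, 1:], init='random', n_clusters=32, n_init=8)
# labels = model.predict(data.iloc[:, 1:])
```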
@@ -10,7 +10,7 @@ from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tools import explainer
dataset = 'complete_with_count.csv'
dataset = 'complete_with_embeddings.csv'
SAVE_MODEL = True
EXPLAIN = True
@@ -51,7 +51,7 @@ def main():
    history = model.fit(X_train, y_train,
                        validation_data=(X_test, y_test),
                        epochs=50, batch_size=64, verbose=0)
                        epochs=50, batch_size=16, verbose=0)
    print("Valid accuracy: " + str([round(score, 3) for score in history.history['val_accuracy']]))
    print("Valid precision: " + str([round(score, 3) for score in history.history['val_precision']]))
......
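The only change here is the batch size dropping from 64 to 16. For context, a self-contained sketch of the same fit-and-report pattern, assuming a TensorFlow/Keras binary classifier compiled with an accuracy metric and a Precision metric named 'precision'; the real model and data pipeline live outside this hunk:

```python
# Self-contained sketch of the fit/report pattern above; the toy model and
# random data are placeholders, not the repository's model.
import numpy as np
import tensorflow as tf

X_train, y_train = np.random.rand(256, 20), np.random.randint(0, 2, 256)
X_test, y_test = np.random.rand(64, 20), np.random.randint(0, 2, 64)

model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu', input_shape=(20,)),
    tf.keras.layers.Dense(1, activation='sigmoid')])
model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy', tf.keras.metrics.Precision(name='precision')])

history = model.fit(X_train, y_train,
                    validation_data=(X_test, y_test),
                    epochs=50, batch_size=16, verbose=0)

# history.history holds one value per epoch for each metric
print("Valid accuracy: " + str([round(score, 3) for score in history.history['val_accuracy']]))
print("Valid precision: " + str([round(score, 3) for score in history.history['val_precision']]))
```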
@@ -2,39 +2,39 @@ import config as cfg
import pandas as pd
import numpy as np
import os
import abc
from abc import ABC, abstractmethod
from typing import List
class BaseCleaner(metaclass=abc.ABCMeta):
    @abc.abstractmethod
class BaseCleaner(ABC):
    @abstractmethod
    def clean_clusters(self, cl):
        """Cleans the cluster data set"""
    @abc.abstractmethod
    @abstractmethod
    def clean_patient_data(self, ptd):
        """Cleans the patient data set"""
    @abc.abstractmethod
    @abstractmethod
    def clean_screening_content(self, sc, ptd):
        """Cleans the screening content data set"""
    @abc.abstractmethod
    @abstractmethod
    def clean_status_set(self, ss, ptd):
        """Cleans the status set data set"""
    @abc.abstractmethod
    @abstractmethod
    def clean_training_done(self, td, ptd):
        """Cleans the training done data set"""
    @abc.abstractmethod
    @abstractmethod
    def clean_training_cancelled(self, tc, ptd):
        """Cleans the training cancelled data set"""
    @abc.abstractmethod
    @abstractmethod
    def clean_assistive_aids(self, ats, ic, ids):
        """Cleans the assistive aids data set"""
    @abc.abstractmethod
    @abstractmethod
    def clean_fall_data(self, fd):
        """Cleans the fall set"""
......
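The refactor swaps metaclass=abc.ABCMeta and @abc.abstractmethod for the equivalent, shorter ABC base class and bare @abstractmethod; behaviour is unchanged. A minimal sketch, trimmed to a single method and with a purely hypothetical subclass, showing that only concrete subclasses can be instantiated:

```python
from abc import ABC, abstractmethod

class BaseCleaner(ABC):
    # Trimmed to one abstract method for brevity; the real class declares several.
    @abstractmethod
    def clean_clusters(self, cl):
        """Cleans the cluster data set"""

# Hypothetical subclass for illustration only; the repository's cleaners live elsewhere.
class Cleaner2020(BaseCleaner):
    def clean_clusters(self, cl):
        return cl

# BaseCleaner()   -> TypeError: can't instantiate abstract class
cleaner = Cleaner2020()   # fine, clean_clusters is implemented
```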
@@ -81,10 +81,24 @@ def init_clusters():
    mlb = MultiLabelBinarizer()
    lod = pd.DataFrame(mlb.fit_transform(df.lod), columns=mlb.classes_, index=df.index)
    print("Longest sequence of devices: {}. Shortest: {}".format(lod.sum(axis=1).max(), lod.sum(axis=1).min()))
    ################################################################################################
    ################# MERGE CLUSTERS with same devices but different order START ###################
    ################################################################################################
    # merge_clusters_with_same_devices
    lod_pure = merge_clusters_with_same_devices(lod, df)
    # create discrete device vector
    cls_guess = df.iloc[lod_pure.index]['lod']
    cls_guess = pd.DataFrame(cls_guess.tolist())
    cls_guess.index = lod_pure.index
    # Save to disk
    lod_pure.to_csv(Path.joinpath(cfg.INTERIM_DATA_DIR, "citizenid_md5_sha256_count_filtered.csv"))
    cls_guess.to_csv(Path.joinpath(cfg.INTERIM_DATA_DIR, "clustering_groups_hash_filtered_discrete.csv"))
    print("shape cluster guess: ", cls_guess.shape)
    return cls_guess, lod_pure
def merge_clusters_with_same_devices(lod, df):
    # find the number of times a device appears in a list of devices
    device_sum = lod.sum(axis=0)
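The extracted merge_clusters_with_same_devices relies on MultiLabelBinarizer producing an order-independent indicator row per citizen, so device lists that differ only in ordering collapse to a single entry under drop_duplicates. A small self-contained sketch of that idea; the device names and the simplified CitizenId column are made up for illustration:

```python
# Illustration of the order-independence idea; device names and IDs are invented.
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

df = pd.DataFrame({
    'CitizenId': [1, 2, 3],
    'lod': [['rollator', 'bed'], ['bed', 'rollator'], ['bed']],
})

mlb = MultiLabelBinarizer()
lod = pd.DataFrame(mlb.fit_transform(df['lod']), columns=mlb.classes_, index=df.index)
lod['CitizenId'] = df['CitizenId']

# Citizens 1 and 2 own the same devices in a different order, so their indicator
# rows are identical and drop_duplicates keeps only the first occurrence.
lod_pure = lod.drop_duplicates(subset=lod.columns.difference(['CitizenId']), keep='first')
print(lod_pure)
```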
@@ -114,23 +128,7 @@ def init_clusters():
    lod = lod.sort_values(cfg.CITIZEN_ID)
    lod_pure = lod.drop_duplicates(subset=lod.columns.difference([cfg.CITIZEN_ID, 'Total']), keep='first')
    print("Unique LOD after removing duplicated devices: ", lod_pure.shape)
    ################################################################################################
    ################## MERGE CLUSTERS with same devices but different order END ####################
    ################################################################################################
    # create discrete device vector
    cls_guess = df.iloc[lod_pure.index]['lod']
    cls_guess = pd.DataFrame(cls_guess.tolist())
    cls_guess.index = lod_pure.index
    # Save to disk
    lod_pure.to_csv(Path.joinpath(cfg.INTERIM_DATA_DIR, "citizenid_md5_sha256_count_filtered.csv"))
    cls_guess.to_csv(Path.joinpath(cfg.INTERIM_DATA_DIR, "clustering_groups_hash_filtered_discrete.csv"))
    print("shape cluster guess: ", cls_guess.shape)
    return cls_guess, lod_pure
    return lod_pure
def calculate_disimmilarity(a, b, X=None, membship=None):
    """
......
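The body of calculate_disimmilarity is cut off above, but its (a, b, X=None, membship=None) signature appears to match the dissimilarity-function interface the kmodes package accepts via its cat_dissim argument. As an illustration of that interface only, here is a simple matching-style measure; it is an assumption, not the repository's actual metric:

```python
# Illustration of a kmodes-style dissimilarity callable; this simple matching
# measure is an assumption, not the repository's calculate_disimmilarity.
import numpy as np

def matching_dissimilarity(a, b, X=None, membship=None):
    """Count, per centroid row in `a`, how many attributes differ from point `b`."""
    return np.sum(a != b, axis=1)

# Example: two candidate centroids compared against one categorical point.
centroids = np.array([['bed', 'rollator'], ['bed', 'toilet seat']])
point = np.array(['bed', 'rollator'])
print(matching_dissimilarity(centroids, point))  # [0 1]
```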