Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Christian Fischer Pedersen
AIR
Commits
5b4d30fd
Commit
5b4d30fd
authored
Aug 27, 2020
by
Christian Marius Lillelund
Browse files
Changed cluster settings
parent
7b8b4cbe
Pipeline
#26451
failed with stage
in 2 minutes and 5 seconds
Changes
13
Pipelines
1
Expand all
Hide whitespace changes
Inline
Side-by-side
notebooks/Model_Completes.ipynb
View file @
5b4d30fd
This diff is collapsed.
Click to expand it.
notebooks/Model_Completes_ATS.ipynb
View file @
5b4d30fd
This diff is collapsed.
Click to expand it.
src/clustering/cluster_maker.py
View file @
5b4d30fd
...
...
@@ -27,14 +27,20 @@ def create_clusters(hu):
print
(
"Unique clusters type E after sorting for cluster sizes greater than {}: {}"
.
format
(
clusterthr
,
cluster_guess
.
shape
))
# Train
# Train
model
model
=
kmd
.
k_modes_train
(
data
,
init
=
cluster_guess
.
values
)
# model = load_model()
# Make predictions
data
[
'Cluster'
]
=
pd
.
Series
(
model
.
predict
(
data
.
to_numpy
()),
index
=
data
.
index
)
data
=
data
[[
'Cluster'
]]
data
.
index
=
data
.
index
.
set_names
([
'CitizenId'
])
data
.
to_csv
(
Path
.
joinpath
(
cfg
.
INTERIM_DATA_DIR
,
'clusters.csv'
),
index
=
True
)
# Save the ATS sequence
data
[
'ATS'
]
=
data
[
data
.
columns
[
0
:
23
]].
apply
(
lambda
x
:
','
.
join
(
x
.
dropna
().
astype
(
str
)),
axis
=
1
)
# Save clusters
data
=
data
.
reset_index
()[[
'CitizenId'
,
'Cluster'
,
'ATS'
]]
data
.
to_csv
(
Path
.
joinpath
(
cfg
.
INTERIM_DATA_DIR
,
'clusters.csv'
),
index
=
False
)
return
model
def
create_vectors
(
hu
):
...
...
src/clustering/kmodes_clf.py
View file @
5b4d30fd
...
...
@@ -18,64 +18,21 @@ def k_modes_train(hu, init='Random'):
:return: The fitted kModels object.
"""
# Variable to keep track of the results of the search
performances
=
pd
.
DataFrame
()
# Methods "Cao" / "huang" / "random"
#TODO: Set 1 init and find a range that works
n_inits
=
range
(
1
,
2
)
# Number of times the k-modes algorithm will be run with different centroid seeds
# The number of clusters to form as well as the number of centroids to generate.
#TODO: Run once.
n_cluster_range
=
range
(
4
,
5
)
if
isinstance
(
init
,
str
)
else
range
(
init
.
shape
[
0
],
init
.
shape
[
0
]
+
1
)
# Metrics on the fitness of the clustering Tries
best_cost
=
None
best_k_modes
=
None
for
n
in
n_inits
:
for
n_cluster
in
n_cluster_range
:
# Initialize kModes object
km
=
KModes
(
init
=
init
,
n_clusters
=
n_cluster
,
n_init
=
n
,
cat_dissim
=
ut
.
dubbah_dissim
)
# Perform k-Modes
km
.
fit_predict
(
hu
)
print
(
'test of print cluster centroids'
)
print
(
km
.
cluster_centroids_
)
# Prepare a summary of the model fitness
performance
=
pd
.
DataFrame
([{
"init"
:
init
,
"n_init_param"
:
n_inits
,
"n_clusters"
:
n_cluster
,
"n_iter_att"
:
km
.
n_iter_
,
"cost"
:
km
.
cost_
,
"centroids"
:
km
.
cluster_centroids_
,
"numberInEachCluster"
:
np
.
unique
(
km
.
labels_
,
return_counts
=
True
)[
1
]
}])
performances
=
pd
.
concat
([
performances
,
performance
],
axis
=
0
)
# Update the best model across initialization methods
if
best_cost
is
None
or
km
.
cost_
<
best_cost
:
best_k_modes
=
km
# Saves the best model for each initialization method as a serialized kMOdes object
init
=
'Self'
if
hasattr
(
init
,
'__array__'
)
else
init
with
open
(
str
(
Path
.
joinpath
(
INTERIM_DATA_DIR
,
'kmodescluster'
+
init
+
'init.pkl'
)),
'wb'
)
as
output
:
pickle
.
dump
(
best_k_modes
,
output
,
pickle
.
HIGHEST_PROTOCOL
)
print
(
"init: {}, init range: {}, n_cluster = {}"
.
format
(
init
,
n_inits
,
n_cluster_range
))
# Initialize kModes
km
=
KModes
(
init
=
init
,
n_clusters
=
init
.
shape
[
0
],
n_init
=
2
,
cat_dissim
=
ut
.
dubbah_dissim
)
#km = KModes(init='random', n_clusters=2, n_init=1, cat_dissim=ut.dubbah_dissim)
# Perform k-Modes
km
.
fit_predict
(
hu
)
n_cluster_range
=
range
(
500
,
501
)
# range(1874, 1875) # The number of clusters to form as well as the number of centroids to generate.
return
best_k_modes
return
km
def
k_modes_predict
(
vector
,
km
=
None
,
readserialized
:
bool
=
True
,
init
:
str
=
'self'
):
...
...
@@ -96,7 +53,6 @@ def k_modes_predict(vector, km=None, readserialized: bool = True, init: str = 's
n_att
=
len
(
km
.
cluster_centroids_
[
0
])
# 28
vector
=
np
.
append
(
np
.
array
(
vector
),
np
.
array
([
0
]
*
(
n_att
-
len
(
vector
)))).
reshape
(
1
,
-
1
)
#TODO: Check that seq is a number of devices
cluster
=
km
.
predict
(
vector
)
# [0933 0 0 0 ...0]
seq
=
km
.
cluster_centroids_
[
cluster
]
# 77
return
seq
...
...
src/clustering/utility.py
View file @
5b4d30fd
...
...
@@ -6,7 +6,7 @@ from src.config import *
def
dubbah_dissim
(
a
,
b
,
X
=
None
,
membship
=
None
):
"""
Utility function for our
personalized
dissimilarity measure following the API on:
Utility function for our dissimilarity measure following the API on:
#https://github.com/nicodv/kmodes/blob/master/kmodes/util/dissim.py.
Eg.
...
...
@@ -25,7 +25,7 @@ def dubbah_dissim(a, b, X=None, membship=None):
[0, 1, 2]
[0, 1, 0] = 1
Similarity =
sum(Step1, Step2, step3) = 4
Similarity = sum(Step1, Step2, step3) = 4
:param verbose: A boolean whether to print the params
:param a: collection of cluster centroids numpy array with shape (centroids, attributes)
...
...
src/config.py
View file @
5b4d30fd
import
json
import
os
from
pathlib
import
Path
FILE_PATHS
=
[
'DigiRehab_BorgerID_TrainingDone.xlsx'
,
...
...
src/get_citizen_clusters.py
0 → 100644
View file @
5b4d30fd
#!/usr/bin/env python
"""
Authors: Cecilie Moriat, Tenna Rasmussen, Christian Fischer Pedersen
Date: 20th March, 2020
"""
import
src.clustering.cluster_maker
as
cluster_maker
import
src.clustering.kmodes_clf
as
kmd
import
src.data.parser
as
parser
import
src.data.cleaner
as
cleaner
import
src.utility.helper_func
as
hf
import
src.data.file_reader
as
fr
import
src.data.file_writer
as
fw
import
src.config
as
cfg
import
src.log
as
log
import
pandas
as
pd
import
os
def main():
    """Export the cluster rows of citizens that also occur in the timeseries.

    Reads ``clusters.csv`` and ``timeseries.csv`` from
    ``cfg.INTERIM_DATA_DIR``, keeps the rows of the clusters table whose
    ``CitizenId`` is present in the timeseries table, and writes the result
    to ``citizen_clusters.csv`` in the same directory.
    """
    cluster_table = fr.read_csv(cfg.INTERIM_DATA_DIR, 'clusters.csv')
    timeseries_table = fr.read_csv(cfg.INTERIM_DATA_DIR, 'timeseries.csv')

    # Row mask: True where the citizen of a cluster row has timeseries data.
    timeseries_ids = list(timeseries_table.CitizenId)
    in_timeseries = cluster_table.CitizenId.isin(timeseries_ids)

    matched = cluster_table.loc[in_timeseries]
    fw.write_csv(matched, cfg.INTERIM_DATA_DIR, 'citizen_clusters.csv')
# Script entry point: call main() only when this file is run directly,
# not when it is imported as a module.
if
__name__
==
'__main__'
:
main
()
src/make_timeseries.py
View file @
5b4d30fd
...
...
@@ -58,7 +58,7 @@ def run():
# Create timeseries features and save it
data
=
data_dto
.
Data
(
patient_data
,
screening_values
,
status_set
,
training_done
,
training_cancelled
,
assistive_aids
,
clusters
)
training_done
,
training_cancelled
,
assistive_aids
,
clusters
)
features
=
fm
.
make_timeseries_features
(
data
)
file_writer
.
write_csv
(
features
,
cfg
.
INTERIM_DATA_DIR
,
'timeseries.csv'
)
...
...
src/model_completes.py
View file @
5b4d30fd
...
...
@@ -52,7 +52,7 @@ def run():
train_rf
(
X
,
y
)
def
train_rf
(
X
,
y
):
clf
=
clfs
.
get_classifier
(
"R
andom Forest
"
)
clf
=
clfs
.
get_classifier
(
"R
F
"
)
mean_auc
,
std_auc
,
mean_acc
,
cm
,
model
=
cv
.
make_cross_val
(
clf
,
X
,
y
)
print
(
f
"Mean AUC:
{
np
.
round
(
mean_auc
,
3
)
}
"
)
print
(
f
"Std AUC:
{
np
.
round
(
std_auc
,
3
)
}
"
)
...
...
src/model_ats.py
→
src/model_
completes_
ats.py
View file @
5b4d30fd
...
...
@@ -45,7 +45,7 @@ def run():
train_rf
(
X
,
y
)
def
train_rf
(
X
,
y
):
clf
=
clfs
.
get_classifier
(
"R
andom Forest
"
)
clf
=
clfs
.
get_classifier
(
"
L
R"
)
mean_auc
,
std_auc
,
mean_acc
,
cm
,
model
=
cv
.
make_cross_val
(
clf
,
X
,
y
)
print
(
f
"Mean AUC:
{
np
.
round
(
mean_auc
,
3
)
}
"
)
print
(
f
"Std AUC:
{
np
.
round
(
std_auc
,
3
)
}
"
)
...
...
src/model_needs.py
View file @
5b4d30fd
...
...
@@ -48,7 +48,7 @@ def run():
train_rf
(
X
,
y
)
def
train_rf
(
X
,
y
):
clf
=
clfs
.
get_classifier
(
"R
andom Forest
"
)
clf
=
clfs
.
get_classifier
(
"R
F
"
)
mean_auc
,
std_auc
,
mean_acc
,
cm
,
model
=
cv
.
make_cross_val
(
clf
,
X
,
y
)
print
(
f
"Mean AUC:
{
np
.
round
(
mean_auc
,
3
)
}
"
)
print
(
f
"Std AUC:
{
np
.
round
(
std_auc
,
3
)
}
"
)
...
...
src/models/classifiers.py
View file @
5b4d30fd
...
...
@@ -3,9 +3,9 @@ from sklearn.ensemble import RandomForestClassifier
def
get_classifier
(
name
,
random_state
=
0
,
n_jobs
=
None
):
classifiers
=
{
'R
andom Forest
'
:
RandomForestClassifier
(
n_estimators
=
400
,
'R
F
'
:
RandomForestClassifier
(
n_estimators
=
400
,
class_weight
=
'balanced'
,
n_jobs
=
n_jobs
,
random_state
=
random_state
),
'L
ogistic Regression
'
:
LogisticRegression
(
solver
=
"liblinear"
,
'L
R
'
:
LogisticRegression
(
solver
=
"liblinear"
,
class_weight
=
'balanced'
,
n_jobs
=
n_jobs
,
random_state
=
random_state
)
}
return
classifiers
[
name
]
\ No newline at end of file
src/models/cross_validator.py
View file @
5b4d30fd
...
...
@@ -8,6 +8,7 @@ import src.data.file_writer as file_writer
import
src.config
as
cfg
import
src.log
as
log
import
os
from
sklearn.preprocessing
import
StandardScaler
logger
=
log
.
setup_logger
(
os
.
path
.
basename
(
__file__
))
...
...
@@ -18,14 +19,17 @@ def make_cross_val(clf, X, y, n_splits=5, shuffle=True, random_state=0):
total_confusion_matrix
=
np
.
zeros
(
shape
=
(
2
,
2
))
cv
=
StratifiedKFold
(
n_splits
,
shuffle
,
random_state
)
sc
=
StandardScaler
()
for
train_index
,
test_index
in
cv
.
split
(
X
,
y
):
logger
.
debug
(
f
'Running CV for
{
train_index
}
and
{
test_index
}
'
)
# Make train/test split
X_train
,
y_train
=
X
.
iloc
[
train_index
],
y
.
iloc
[
train_index
]
X_test
,
y_test
=
X
.
iloc
[
test_index
],
y
.
iloc
[
test_index
]
# Scale data
X_train
=
sc
.
fit_transform
(
X_train
)
X_test
=
sc
.
transform
(
X_test
)
# Train and get predictions
model
,
y_pred
,
y_pred_proba
=
train_and_predict
(
clf
,
X_train
,
X_test
,
y_train
)
...
...
@@ -42,7 +46,6 @@ def make_cross_val(clf, X, y, n_splits=5, shuffle=True, random_state=0):
# Get normalized CM and add to total
cm
=
met
.
get_confusion_matrix
(
y_test
,
y_pred
)
total_confusion_matrix
=
np
.
add
(
total_confusion_matrix
,
cm
)
logger
.
debug
(
f
'Completed CV run for
{
train_index
}
and
{
test_index
}
'
)
# Compute collective results
mean_model_auc
=
mean
(
model_results_auc
)
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment