"""
Methods to train a kModes algorithm to cluster  discrete loan sequence patterns

Authors: Jeppe Gravgaard, Hugo Daniel Macedo and Christian Fischer Pedersen

Date: 21th March, 2019.
"""
# standard library imports
import pickle
from pathlib import Path

# external imports
from kmodes.kmodes import KModes
import numpy as np
import pandas as pd

# local imports
from src.clustering.clean import *
from src.config import *


def create_clusters(hu, test_mode=False):
    """
    Cluster the discrete loan sequence patterns in hu using a kModes model.

    :param hu: A pd.DataFrame of device loans containing the DEV_ISO_CLASS column.
    :param test_mode: Bool; if True, nothing is written to disk.
    :return: The fitted kModes object.
    """
    # Keep only the first four characters of the ISO device class and treat it as categorical
    hu[DEV_ISO_CLASS] = hu[DEV_ISO_CLASS].apply(lambda ik: str(ik)[0:4])
    hu[DEV_ISO_CLASS] = pd.Categorical(hu[DEV_ISO_CLASS])

    hu = filter_duplicated_devices(hu)
    data = create_vectors(hu)
    create_hash_sequences(hu)

    cl_guess, lod_pure = init_clusters()

    # Choose initial clusters whose size is at least the threshold
    clusterthr = 100
    cluster_guess = cl_guess.loc[lod_pure[lod_pure['Total'] >= clusterthr].index].replace(np.nan, 0)

    print("Unique clusters type E after sorting for cluster sizes greater than {}: {}".
          format(clusterthr, cluster_guess.shape))

    # Train the kModes model, using the filtered clusters as initial centroids
    model = k_modes_train(data, test_mode, init=cluster_guess.values)

    # model = load_model()
    # Assign each citizen to its closest cluster and keep only the cluster label
    data['Cluster'] = pd.Series(model.predict(data.to_numpy()), index=data.index)
    data = data[['Cluster']]

    if not test_mode:
        data.to_csv(Path.joinpath(INTERIM_DATA_DIR, 'clusters.csv'), index=True)

    return model


def k_modes_train(hu, test_mode=False, init='Random'):
    """
    Train a kModes classifier.

    docs: https://github.com/nicodv/kmodes/blob/master/kmodes/kmodes.py
    literature = https://grid.cs.gsu.edu/~wkim/index_files/papers/kprototype.pdf

    :param init: 'random' for a pure k modes approach
    :param hu: A pd.DataFrame with the discrete loan sequences patterns BorgerID = [Device1, Device2, ..., LastDevice]

    :return: The fitted kModels object.
    """

    # Variable to keep track of the results of the search
    performances = pd.DataFrame()

    # String initialization methods supported by kmodes: "Cao" / "Huang" / "random"
    n_inits = range(1, 2)  # Number of times the k-modes algorithm will be run with different centroid seeds

    # The number of clusters to form as well as the number of centroids to generate
    n_cluster_range = range(4, 5) if isinstance(init, str) else range(init.shape[0], init.shape[0] + 1)

    # Metrics on the fitness of the clustering tries
    best_cost = None
    best_k_modes = None

    for n in n_inits:
        for n_cluster in n_cluster_range:
            # Initialize kModes object
            km = KModes(init=init,
                        n_clusters=n_cluster,
                        n_init=n,
                        cat_dissim=dubbah_dissim
                        )

            # Perform k-Modes
            km.fit_predict(hu)
            print('Cluster centroids:')
            print(km.cluster_centroids_)

            # Prepare a summary of the model fitness
            performance = pd.DataFrame([{"init": init,
                                         "n_init_param": n,
                                         "n_clusters": n_cluster,
                                         "n_iter_att": km.n_iter_,
                                         "cost": km.cost_,
                                         "centroids": km.cluster_centroids_,
                                         "numberInEachCluster": np.unique(km.labels_, return_counts=True)[1]
                                         }])

            performances = pd.concat([performances, performance], axis=0)

            # Update the best model across initialization methods
            if best_cost is None or km.cost_ < best_cost:
                best_cost = km.cost_
                best_k_modes = km

        # Save the best model for each initialization method as a serialized kModes object
        init = 'Self' if hasattr(init, '__array__') else init

        if not test_mode:
            with open(str(Path.joinpath(INTERIM_DATA_DIR, 'kmodescluster' + init + 'init2.pkl')), 'wb') as output:
                pickle.dump(best_k_modes, output, pickle.HIGHEST_PROTOCOL)

        print("init: {}, init range: {}, n_cluster = {}".format(init, n_inits, n_cluster_range))

    n_cluster_range = range(500, 501)
    # range(1874, 1875)  # The number of clusters to form as well as the number of centroids to generate.
    return best_k_modes
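

# The sketch below is an addition for illustration, not part of the original pipeline: it shows
# how k_modes_train could be exercised on small synthetic categorical data (random integer codes)
# with the 'Huang' initialization supported by the kmodes package. The real pipeline passes the
# cluster_guess centroids computed in create_clusters as init instead.
def _demo_k_modes_train():
    # 20 synthetic "citizens" with 5 categorical attributes each
    demo_data = pd.DataFrame(np.random.randint(0, 3, size=(20, 5)))
    # test_mode=True so the demo model is not written to disk
    return k_modes_train(demo_data, test_mode=True, init='Huang')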


def k_modes_predict(vector, km=None, readserialized: bool = True, init: str = 'self'):
    """
    Predicts the device sequence for a given citizen with the history given in vector.

    This is done either by passing in a kModes object or by reading a serialized version from disk.

    :type vector: list
    :param vector: vector of the discrete device sequence patterns
    :param km: A trained kModes object
    :param readserialized: Bool whether to read the serialized object from the disk.
    :param init: The type of initialization method used in the kModes object
    :return: The centroid of the predicted cluster, i.e. the predicted device sequence

    """

    # Read the serialized model from disk unless a trained kModes object should be reused
    if km is None or readserialized:
        km = load_model()

    n_att = len(km.cluster_centroids_[0])  # number of attributes, e.g. 28
    # Pad the device history with zeros up to the full attribute length, e.g. [0933 0 0 0 ... 0]
    vector = np.append(np.array(vector), np.array([0] * (n_att - len(vector)))).reshape(1, -1)

    cluster = km.predict(vector)  # index of the closest cluster, e.g. 77
    seq = km.cluster_centroids_[cluster]
    return seq


def k_modes_test(data, init: str = 'self'):
    """
    Tests the predict function of the kModes model.

    :param data: A pd.DataFrame with the columns used for training (used to print the column names)
    :param init: The type of initialization method used in the kModes object
    :return:
    """
    km = load_model()

    seq = k_modes_predict([], km)

    print("Test kModes Algorithm: ")
    print("kModes initialization method: ", km.init)
    if init == 'Bool':
        for cluster in km.cluster_centroids_:
            print("columns:", data.columns.values[np.where(cluster == '1')])
    else:
        print("kModes centroids:", km.cluster_centroids_)
    print("Predicted sequence: ", seq)

    return


def dubbah_dissim(a, b, X=None, membship=None, verbose: bool = False):
    """
    Utility function for our personalized dissimilarity measure following the API on:
    https://github.com/nicodv/kmodes/blob/master/kmodes/util/dissim.py.

    E.g.
    a = [0, 1, 2] and b = [0, 1, 0]
    dubbah_dissim(a, b) would go through 3 steps

    Step 1:
    [0, 1, 2]
    ......[0, 1, 0] = 1

    Step 2:
    [0, 1, 2]
    ...[0, 1, 0] = 2

    Step 3:
    [0, 1, 2]
    [0, 1, 0] = 1

    Dissimilarity = sum(Step1, Step2, Step3) = 4

    :param verbose: A boolean whether to print the params
    :param a: collection of cluster centroids - numpy array with shape (centroids, attributes)
    :param b: The vector for one specific citizen - numpy array with shape (attributes,)
    :param X: The complete dataset used for training - numpy array with shape (npoints, nattributes)
    :param membship: np.array - an indicator array mapping points -> clusters
    :return: The dissimilarity measure for each cluster
    """
    if verbose:
        print(b.ndim, b.shape, b[0])
        print(a.ndim, a.shape)

    # The number of attributes
    n_att = b.shape[b.ndim - 1]

    # For each centroid, sum the mismatches over all alignments of the centroid's tail
    # against the head of the citizen vector (i == 0 compares the full vectors)
    return np.array([np.array([np.sum(cc[-i:] != b[:i]) if i != 0 else np.sum(cc != b)
                               for i in range(0, n_att)]).sum()
                     for idj, cc in enumerate(a)])
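

# Minimal sketch, added for illustration (not part of the original module): it reproduces the
# worked example from the dubbah_dissim docstring on illustrative values so the shifted-alignment
# measure can be checked by hand. Expected output: [4].
def _demo_dubbah_dissim():
    a = np.array([[0, 1, 2]])  # a single cluster centroid with three attributes
    b = np.array([0, 1, 0])    # one citizen vector
    # mismatches: 1 (full alignment) + 1 (tail of length 1) + 2 (tail of length 2) = 4
    print(dubbah_dissim(a, b))
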

def load_model():
    """Read the serialized kModes model from disk and return it."""
    with open(str(Path.joinpath(INTERIM_DATA_DIR, 'kmodesclusterSelfinit.pkl')), 'rb') as infile:
        model = pickle.load(infile)
    return model
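

# Hedged usage sketch, added for illustration and not present in the original file: running the
# module directly exercises the two demo helpers above on synthetic data. The actual pipeline is
# expected to call create_clusters with the prepared loan DataFrame instead.
if __name__ == '__main__':
    _demo_dubbah_dissim()
    _demo_k_modes_train()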