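"""cluster_maker.py

Cluster citizens by their device loan sequences: devices are reduced to
four-digit ISO classes, identical loan patterns are collapsed into
hash-sequences, and the most common hash-sequences seed a k-modes model
whose cluster assignments are written to interim CSV files.
"""
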
import ast
import hashlib
import pickle
from csv import writer
from io import StringIO
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

import src.config as cfg
import src.clustering.cleaner as cleaner
import src.clustering.kmodes_clf as kmd

def create_clusters(hu):
    hu[cfg.DEV_ISO_CLASS] = hu[cfg.DEV_ISO_CLASS].apply(lambda ik: str(ik)[0:4])  # keep only the four-digit ISO class
    hu[cfg.DEV_ISO_CLASS] = pd.Categorical(hu[cfg.DEV_ISO_CLASS])
    hu = cleaner.clean_devices(hu)
    data = create_vectors(hu)
    create_hash_sequences(hu)

    cl_guess, lod_pure = init_clusters()
    # keep only the initial clusters whose group size is at least the threshold
    clusterthr = 100
    cluster_guess = cl_guess.loc[lod_pure[lod_pure['Total'] >= clusterthr].index].replace(np.nan, 0)

    print("Unique clusters type E after filtering for cluster sizes of at least {}: {}"
          .format(clusterthr, cluster_guess.shape))

    # Train the k-modes model, seeded with the hash-sequence centroid guesses
    model = kmd.k_modes_train(data, init=cluster_guess.values)

    # Assign every citizen's sequence to a cluster
    data['Cluster'] = pd.Series(model.predict(data.to_numpy()), index=data.index)
    data.index = data.index.set_names(['CitizenId'])

    # Save the ATS sequence as one comma-separated string per citizen.
    # NOTE: the 0:23 slice hard-codes the number of device columns; it must
    # stop before the 'Cluster' column appended above.
    data['ATS'] = data[data.columns[0:23]].apply(lambda x: ','.join(x.dropna().astype(str)), axis=1)

    # Save clusters
    data = data.reset_index()[['CitizenId', 'Cluster', 'ATS']]
    data.to_csv(Path.joinpath(cfg.INTERIM_DATA_DIR, 'clusters.csv'), index=False)

    return model

def create_vectors(hu):
    """
    Create the discrete loan sequaence patterns for the hu data.

    :param hu: HU dataframed cleaned and filtered
    :return: pd.DataFrame with the discrete loan sequences.
    """

    max_len = hu.groupby(cfg.CITIZEN_ID)[cfg.DEV_ISO_CLASS].size().max()  # length of the longest device history; all vectors are padded to this
    hu = hu[[cfg.CITIZEN_ID, cfg.DEV_ISO_CLASS]]  # Throw away unused columns

    # Create timelines
    # https://stackoverflow.com/questions/41888080/python-efficient-way-to-add-rows-to-dataframe
    # Dict{CitizenId : pd.DataFrame}
    grouped = dict(list(hu.groupby(cfg.CITIZEN_ID, observed=True)))

    output = StringIO()
    csv_writer = writer(output)

    for key, vals in grouped.items():
        # write one row per citizen: [CitizenId, device_1, ..., device_n, 0, ..., 0],
        # zero-padded so every row has exactly max_len device entries
        csv_writer.writerow(
            np.append(np.append(key, vals[cfg.DEV_ISO_CLASS]),
                      np.array([0] * (max_len - len(vals[cfg.DEV_ISO_CLASS])))))

    output.seek(0)  # rewind to the start of the in-memory buffer
    df_discrete = pd.read_csv(output,
                              header=None,
                              index_col=0,
                              dtype=str)
    return df_discrete
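
# Example of the row layout create_vectors emits (values are illustrative):
# with max_len == 5, a citizen 4711 with devices ['1222', '0912'] becomes
#   4711,1222,0912,0,0,0
# i.e. the CitizenId followed by the device sequence, zero-padded to max_len.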


def create_hash_sequences(hu):
    """
    Find identical loan sequence patterns referred to as hash-sequences in the data set.

    :param hu: A pd.DataFrame with Hu-data

    """

    hu = hu[[cfg.CITIZEN_ID, cfg.DEV_ISO_CLASS]]  # Throw away unused columns
    grouped = hu.groupby(cfg.CITIZEN_ID, observed=True)
    df = pd.DataFrame(index=range(0, grouped.ngroups), columns=[cfg.CITIZEN_ID, 'md5', 'sha256', 'lod'])

    i = 0
    for key, vals in grouped:
        lod = str(list(vals[cfg.DEV_ISO_CLASS]))
        # use df.loc[row, col]: chained df.iloc[i][col] assignment may write to a copy
        df.loc[i, cfg.CITIZEN_ID] = key
        df.loc[i, 'md5'] = hashlib.md5(lod.encode()).hexdigest()
        df.loc[i, 'sha256'] = 0  # hashlib.sha256(lod.encode()).hexdigest()
        df.loc[i, 'lod'] = lod
        i += 1

    df.to_csv(Path.joinpath(cfg.INTERIM_DATA_DIR, 'citizenid_md5_sha256.csv'))

    df2 = df.groupby(['md5', 'sha256', 'lod'], observed=True).count()
    df2 = df2.sort_values(by=cfg.CITIZEN_ID, ascending=False)
    df2.to_csv(Path.joinpath(cfg.INTERIM_DATA_DIR, 'citizenid_md5_sha256_count.csv'))
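
    # Hashing example (values are illustrative): two citizens with the same
    # device sequence produce the same digest,
    #   hashlib.md5(str(['1222', '0912']).encode()).hexdigest()
    # so grouping on 'md5' counts each distinct loan pattern once.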

def init_clusters():
    """
    Initialize cluster centroids based on the Hash-sequences

    :return: a tuple of (filtered hash sequences, hash sequences and their sizes)
    """
    df = pd.read_csv(Path.joinpath(cfg.INTERIM_DATA_DIR, 'citizenid_md5_sha256_count.csv'))
    df = df.drop(['sha256'], axis=1)
    df["lod"] = df["lod"].apply(lambda x: ast.literal_eval(x))
    df = df.sort_values(by="lod")
    print("Unique LOD: ", df.shape)
    print("Number of Hash-sequences with size 1: ", len(df[df[cfg.CITIZEN_ID] == 1]))

    # create a boolean indicator for every cluster
    # https://stackoverflow.com/questions/29034928/pandas-convert-a-column-of-list-to-dummies
    mlb = MultiLabelBinarizer()
    lod = pd.DataFrame(mlb.fit_transform(df.lod), columns=mlb.classes_, index=df.index)
    print("Longest sequence of devices: {}. Shortest: {}".format(lod.sum(axis=1).max(), lod.sum(axis=1).min()))
    ################################################################################################
    ################# MERGE CLUSTERS with same devices but different order START ###################
    ################################################################################################

    # count, for each device, the number of device lists it appears in
    device_sum = lod.sum(axis=0)

    # keep devices that appear at least the threshold (the 15% quantile, ≈ 4 times)
    thr = device_sum.quantile(0.15)
    device_sum_thr = device_sum >= thr
    iks_freq = device_sum_thr.index.where(device_sum_thr).dropna().values
    print("Number of devices appearing less than a threshold of {}: {}".
          format(thr, len(device_sum_thr)-len(iks_freq)))
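    # e.g. (illustrative) pd.Series([1, 2, 3, 4, 100]).quantile(0.15) returns 1.6
    # (linear interpolation at position 0.15 * (n - 1)), so thr need not be an
    # observed count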

    # Keep only the frequent iks
    lod = lod[iks_freq]

    # count rows with identical devices in a different order
    print("Unique clusters type E: {}. Number of identical rows in a different order: {}"
          .format(lod.shape, lod.duplicated().sum()))

    # merge boolean clusters with group size
    lod[cfg.CITIZEN_ID] = df[cfg.CITIZEN_ID]

    # sum the number of citizens across rows that have the same devices in a different order
    groupby = lod.groupby(by=list(lod.columns.difference([cfg.CITIZEN_ID]).values), as_index=False)
    lod['Total'] = groupby[cfg.CITIZEN_ID].transform('sum')
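    # e.g. (illustrative) the sequences ['1222', '0912'] and ['0912', '1222'] have
    # identical boolean rows, so transform('sum') assigns both the same 'Total'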

    # drop the duplicates and keep the cluster with the most citizens
    # https://stackoverflow.com/questions/34420864/pandas-remove-duplicate-rows-except-the-one-with-highest-value-from-another-co
    lod = lod.sort_values(cfg.CITIZEN_ID, ascending=False)  # largest group first, so keep='first' retains it
    lod_pure = lod.drop_duplicates(subset=lod.columns.difference([cfg.CITIZEN_ID, 'Total']), keep='first')
    print("Unique LOD after removing duplicated devices: ", lod_pure.shape)

    ################################################################################################
    ################## MERGE CLUSTERS with same devices but different order END ####################
    ################################################################################################

    # create discrete device vector
    cls_guess = df.iloc[lod_pure.index]['lod']
    cls_guess = pd.DataFrame(cls_guess.tolist())
    cls_guess.index = lod_pure.index

    # Save to disk
    lod_pure.to_csv(Path.joinpath(cfg.INTERIM_DATA_DIR, "citizenid_md5_sha256_count_filtered.csv"))
    cls_guess.to_csv(Path.joinpath(cfg.INTERIM_DATA_DIR, "clustering_groups_hash_filtered_discrete.csv"))

    print("shape cluster guess: ", cls_guess.shape)

    return cls_guess, lod_pure
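

if __name__ == '__main__':
    # Minimal driver sketch: the input file name and the model output path are
    # assumptions for illustration; the real pipeline may load the HU frame and
    # persist the model elsewhere.
    hu = pd.read_csv(Path.joinpath(cfg.INTERIM_DATA_DIR, 'hu_cleaned.csv'))
    model = create_clusters(hu)
    with open(Path.joinpath(cfg.INTERIM_DATA_DIR, 'kmodes_model.pkl'), 'wb') as f:
        pickle.dump(model, f)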