make_clusters.py 1.76 KB
Newer Older
1
2
#!/usr/bin/env python

3
4
import numpy as np
import pandas as pd
5
6
import paths as pt
import yaml
7
8
from typing import List
from kmodes import kmodes
9
from pathlib import Path
10
from tools import file_reader, file_writer, preprocessor
11
12

def main():
    """Cluster citizens on their Ats columns with k-modes and persist the results.

    Steps, in order:

    1. Load ``settings.yaml`` from ``pt.CONFIGS_DIR``.
    2. Read ``screenings.csv`` from ``pt.INTERIM_DATA_DIR`` (``CitizenId`` is
       read as ``str``) and split the ``Ats`` column into
       ``settings['ats_resolution']`` columns named ``1Ats``, ``2Ats``, ...
    3. Optionally replace category values when
       ``settings['use_real_ats_names']`` is truthy.
    4. Fit a ``KModes`` model (``n_clusters=20``, ``init='Huang'``,
       ``n_init=15``) on the string-cast Ats columns and predict a cluster
       for every row.
    5. Write the fitted model to ``pt.CLUSTERS_DIR / km.joblib`` and the
       centroids (``cluster_centroids.csv``) and per-citizen assignments
       (``cl.csv``) to ``pt.INTERIM_DATA_DIR``.
    """
    with open(Path.joinpath(pt.CONFIGS_DIR, "settings.yaml"), 'r') as stream:
        settings = yaml.safe_load(stream)
    ats_resolution = settings['ats_resolution']

    df = file_reader.read_csv(pt.INTERIM_DATA_DIR, 'screenings.csv',
                              converters={'CitizenId': str})
    df = preprocessor.split_cat_columns(df, col_to_split='Ats', tag='Ats',
                                        resolution=ats_resolution)

    if settings['use_real_ats_names']:
        df = preprocessor.replace_cat_values(df)

    # Keep only the id column followed by the split Ats columns.
    cols_ats = [f'{i}Ats' for i in range(1, ats_resolution + 1)]
    df = df[['CitizenId'] + cols_ats]

    # Cast to str once and feed the *same* matrix to fit() and predict().
    # Fitting on string-cast data but predicting on the raw frame means any
    # non-string cell (e.g. NaN) no longer matches a category seen at fit time.
    features = df[cols_ats].astype(str).to_numpy()

    model = kmodes.KModes(init='Huang', n_clusters=20, n_init=15, n_jobs=-1)
    model.fit(features)
    predictions = model.predict(features)

    # Build the label Series on df's index: the DataFrame constructor aligns
    # Series by index, so a fresh RangeIndex could misalign rows with CitizenId.
    cluster_labels = pd.Series(predictions, index=df.index, name="Cluster")

    # Comma-separated Ats values per citizen, skipping missing entries.
    ats_sequence = df[cols_ats].apply(
        lambda row: ','.join(row.dropna().astype(str)), axis=1)

    assignments = pd.DataFrame({
        'CitizenId': df['CitizenId'],
        'Cluster': cluster_labels,
        'Ats': ats_sequence
    })

    # One column per cluster (0..n_clusters-1), one row per Ats position.
    cluster_centroids = pd.DataFrame(dict(enumerate(model.cluster_centroids_)))

    file_writer.write_joblib(model, pt.CLUSTERS_DIR, 'km.joblib')
    file_writer.write_csv(cluster_centroids, pt.INTERIM_DATA_DIR, 'cluster_centroids.csv')
    file_writer.write_csv(assignments, pt.INTERIM_DATA_DIR, 'cl.csv')
46

47
48
# Run the clustering pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()