added raw markov chain code

203f8b2b · Christian Marius Lillelund · 6f99c738 · 203f8b2b
Commit 203f8b2b authored 3 years ago by Christian Marius Lillelund
--- a/ml/src/model/make_markov_chain_model.py
+++ b/ml/src/model/make_markov_chain_model.py
+# %%
+from pathlib import Path
+import pandas as pd
+import tensorflow as tf
+from tools import file_reader
+from sklearn.model_selection import train_test_split
#%%
# Get data
# Root folder of the project data; the raw inputs live in its "raw_data"
# sub-folder.
path = "C:/Users/Daniel/Documents/air/R_and_D_Daniel_Thomas"
raw_data_dir = Path(path + "/raw_data")

# Raw records, renumbered 0..n-1 so later label-based drops are unambiguous.
raw_df = file_reader.read_pickle(raw_data_dir, 'ats.pkl')
raw_df = raw_df.reset_index(drop=True)

# Mapping table with the columns used below (ats_id, ats_name). The converter
# turns every ats_id into a str (presumably forwarded to pandas.read_csv by
# the project helper - confirm).
mapping = file_reader.read_csv(raw_data_dir, 'ats.csv', converters={'ats_id': str})
#%%
# GENERATE SEQUENCES OR USE NEXT CELL TO READ FROM PICKLE

# Work on a copy so raw_df stays untouched, and keep only the first six
# characters of every ISO class value.
df = raw_df.copy()
df['DevISOClass'] = df['DevISOClass'].apply(lambda iso_class: iso_class[:6])

# Records without a citizen id cannot be grouped into a per-citizen sequence.
df = df.dropna(subset=['CitizenId'])

# Build a lookup from the rows of the mapping table (dict() needs each row to
# be a pair) and substitute matching cell values.
# NOTE(review): replace() is applied to every column of df, not only
# DevISOClass - confirm that this is intended.
mapping_dict = dict(mapping.values)
df = df.replace(to_replace=mapping_dict)

# Chronological order, so that row order within a citizen follows LendDate.
df = df.sort_values(by='LendDate')

# Discard rows whose ISO class consists of digits only.
cond = df['DevISOClass'].str.isdigit()
df = df.drop(df[cond].index)

# Remove citizens with only 1 record
counts = df.CitizenId.value_counts()
single_record_counts = counts[counts < 2]
df = df[~df['CitizenId'].isin(single_record_counts.index)]
+
# Create list of sequence
# For every citizen, pair each DevISOClass value with the one that follows it
# in row order (df was sorted by LendDate above). groupby(sort=False) visits
# the citizens in order of first appearance, and a single pass over the frame
# replaces the per-citizen boolean filter. Pairing the values directly also
# avoids the deprecated Keras TimeseriesGenerator, which was only used here to
# produce (value[i - 1], value[i]) pairs.
data = []
for _, iso_classes in df.groupby('CitizenId', sort=False)['DevISOClass']:
    values = iso_classes.values
    # (current, next) transitions; a group with one row contributes nothing.
    data.extend(zip(values[:-1], values[1:]))

# Unpack data and generate test and training sets
(x, y) = zip(*data)
# %%
# data from pickle
# Alternative to the generation cell: read previously stored pairs, whose
# 'x' and 'y' columns are used as inputs and targets.
sequences_dir = Path(path + "/preprocessed_data_markov")
data_df = file_reader.read_pickle(sequences_dir, 'sequences.pkl')
data_df = data_df.reset_index(drop=True)
x, y = data_df['x'].values, data_df['y'].values
#%%
# generate test and training sets
# 33% of the pairs are held out; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42
)
#%%
def getProbality(list, index, length):
    """Return ``list[index] / length`` rounded to three decimals.

    Args:
        list: Container of counts that supports ``list[index]`` (for example
            the result of ``Series.value_counts()`` or a dict). The name
            shadows the builtin but is kept for caller compatibility.
        index: Key (or position) whose count is looked up.
        length: Total number of observations used as the denominator.

    Returns:
        The rounded relative frequency, or 0 when ``index`` is not present
        in ``list`` or when ``length`` is 0.
    """
    # No observations at all: every relative frequency is zero and dividing
    # would raise ZeroDivisionError.
    if length == 0:
        return 0
    try:
        count = list[index]
    except (KeyError, IndexError):
        # Missing key / out-of-range position means the value was never seen.
        return 0
    return round(count / length, 3)
#%%
def generate_propbality_matrix(x, y, ats_names=None):
    """Build the transition-probability matrix from observed (x, y) pairs.

    Row ``i`` describes the pairs whose ``x`` equals ``ats_names[i]``; the
    value in column ``j`` is the share of those pairs whose ``y`` equals
    ``ats_names[j]``, rounded to three decimals. A name that never occurs as
    ``x`` gets a row of zeros.

    Args:
        x: Sequence of current values.
        y: Sequence of following values, aligned with ``x``.
        ats_names: Ordered names used for both rows and columns. Defaults to
            ``mapping['ats_name'].values`` from the module-level ``mapping``.

    Returns:
        A float DataFrame with a 0..n-1 row index and ``ats_names`` as columns.
    """
    if ats_names is None:
        ats_names = mapping['ats_name'].values
    names = [name for name in ats_names]

    transitions = pd.DataFrame({'x': x, 'y': y})
    rows = []
    for current in names:
        followers = transitions.loc[transitions['x'] == current, 'y']
        # Denominator is the number of pairs, as in len(temp['y']) before.
        total = len(followers)
        if total == 0:
            rows.append([0.0] * len(names))
            continue
        # Count once per row, then align the counts with the column order;
        # names that never followed `current` get a count of 0.
        counts = followers.value_counts().reindex(names, fill_value=0)
        rows.append((counts / total).round(3).tolist())

    # Assemble the frame in one step; DataFrame.append no longer exists in
    # pandas 2.x and copied the whole frame on every call.
    return pd.DataFrame(rows, columns=names, dtype=float)
#%%
# Predict
def predict(ats_name, test_df, prop_df):
    """Return the stored prediction list for ``ats_name``.

    Args:
        ats_name: Current value to predict a follower for.
        test_df: Frame whose first column holds, per row, the list of
            predicted names (row order matches ``prop_df``).
        prop_df: Transition-probability matrix whose rows are ordered like
            its columns, as produced by ``generate_propbality_matrix``.

    Returns:
        The entry of ``test_df``'s first column for the row of ``ats_name``
        when that row of ``prop_df`` contains a probability above 0;
        otherwise ``['']`` (also for a name that is not a column of
        ``prop_df``).
    """
    # Rows of prop_df follow the order of its columns, so the column
    # position of the name is also its row position.
    positions = (prop_df.columns == ats_name).nonzero()[0]
    if len(positions) == 0:
        # Unknown name: nothing was learned for it.
        return ['']
    row = positions[0]
    if prop_df.iloc[row].max() > 0.0:
        return test_df.iloc[row].values[0]
    # The name was never seen as a predecessor in the training pairs.
    return ['']
+
#%%
# Calculate accuracy
def calculate_accuracy(y_test, y_pred):
    """Return the share of targets contained in their prediction collection.

    Args:
        y_test: Sequence of true values.
        y_pred: Sequence of collections of predicted values, aligned with
            ``y_test``.

    Returns:
        Number of positions where ``y_test[i] in y_pred[i]`` divided by
        ``len(y_test)``; 0.0 when ``y_test`` is empty.
    """
    total = len(y_test)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    number_of_matches = sum(
        1 for test_value, pred_values in zip(y_test, y_pred)
        if test_value in pred_values
    )
    return number_of_matches / total
#%% Predict top k
def predict_top_k(k,prop_df):
    """Return the top-k accuracy of ``prop_df`` on the module-level test split.

    For every row of ``prop_df`` the ``k`` column names with the largest
    values are collected; each entry of ``X_test`` is then looked up with
    ``predict`` and the results are scored against ``y_test`` with
    ``calculate_accuracy``.
    """
    top_k_df = pd.DataFrame()
    top_k_df['Max'] = prop_df.apply(
        lambda row: row.abs().nlargest(k).index.tolist(), axis=1
    )
    predictions = [predict(name, top_k_df, prop_df) for name in X_test]
    return calculate_accuracy(y_test, predictions)
+
#%%
# Estimate the transition probabilities from the training pairs only.
df_prop = generate_propbality_matrix(X_train,y_train)
#%%
# predict for k in range 1-5
results = [predict_top_k(k, df_prop) for k in range(1, 6)]
# Accuracies as percentages with two decimals.
accuracy_results = [round(num * 100, 2) for num in results]
for k, accuracy in enumerate(accuracy_results, start=1):
    print('top ' + str(k) + ': ' + str(accuracy))
\ No newline at end of file