Commit 203f8b2b authored by Christian Marius Lillelund

added raw markov chain code

parent 6f99c738
# %%
from pathlib import Path
import pandas as pd
import tensorflow as tf
from tools import file_reader
from sklearn.model_selection import train_test_split
#%%
# Get data
path = "C:/Users/Daniel/Documents/air/R_and_D_Daniel_Thomas"
raw_df = file_reader.read_pickle(Path(path+"/raw_data"), 'ats.pkl').reset_index(drop=True)
mapping = file_reader.read_csv(Path(path+"/raw_data"), 'ats.csv',
                               converters={'ats_id': str})
#%%
# GENERATE SEQUENCES OR USE NEXT CELL TO READ FROM PICKLE
# Shorten iso class
df = raw_df.copy()
df['DevISOClass'] = df['DevISOClass'].apply(lambda x: x[:6])
df = df.dropna(subset=['CitizenId'])
mapping_dict = dict(mapping.values)
df = df.replace(to_replace=mapping_dict)
df.sort_values(by='LendDate', inplace=True)
cond = df['DevISOClass'].str.isdigit()
df.drop(df[cond].index, inplace=True)
# Remove citizens with only 1 record
counts = df.CitizenId.value_counts()
df = df[~df['CitizenId'].isin(counts[counts < 2].index)]
# Create a list of sequences, one per citizen
combinedseq = []
for citizenId in df.CitizenId.unique():
    temp = df.loc[df.CitizenId == citizenId]
    seq = tf.keras.preprocessing.sequence.TimeseriesGenerator(temp.DevISOClass.values,
                                                              temp.DevISOClass.values,
                                                              length=1,
                                                              batch_size=1,
                                                              shuffle=False)
    combinedseq.append(seq)
# Get (previous device, next device) pairs from the list of sequences
flat_list = [item for sublist in combinedseq for item in sublist]
data = list(map(lambda row: (row[0][0][0], row[1][0]), flat_list))
# Unpack data and generate test and training sets
(x,y) = zip(*data)
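#%%
# Illustrative sanity check (not part of the original pipeline): with length=1
# the TimeseriesGenerator above effectively yields (previous device, next device)
# pairs, i.e. first-order Markov transitions. The device names below are made up.
toy_devices = pd.Series(['wheelchair', 'rollator', 'bed']).values
toy_seq = tf.keras.preprocessing.sequence.TimeseriesGenerator(toy_devices, toy_devices,
                                                              length=1, batch_size=1,
                                                              shuffle=False)
toy_pairs = [(batch[0][0][0], batch[1][0]) for batch in toy_seq]
print(toy_pairs)  # expected: [('wheelchair', 'rollator'), ('rollator', 'bed')]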
# %%
# Alternatively, read previously generated sequences from pickle
data_df = file_reader.read_pickle(Path(path+"/preprocessed_data_markov"), 'sequences.pkl').reset_index(drop=True)
x = data_df['x'].values
y = data_df['y'].values
#%%
# generate test and training sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
#%%
def getProbality(counts, index, length):
    # Fraction of observed transitions that go to 'index'; 0 if never observed
    try:
        return round(counts[index] / length, 3)
    except KeyError:
        return 0
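#%%
# Toy usage of getProbality (illustrative only, with hypothetical device names):
# counts come from a value_counts-style Series; a missing target falls back to 0
toy_counts = pd.Series({'rollator': 3, 'bed': 1})
print(getProbality(toy_counts, 'rollator', 4))    # 0.75
print(getProbality(toy_counts, 'wheelchair', 4))  # 0 via the KeyError branch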
#%%
def generate_propbality_matrix(x, y):
    # Build a first-order Markov transition matrix: one row per source device,
    # one column per target device, holding empirical transition probabilities
    df_test = pd.DataFrame({'x': x, 'y': y})
    prop_df = pd.DataFrame(columns=mapping['ats_name'].values)
    for source_ats in mapping['ats_name'].values:
        temp = df_test.loc[df_test['x'] == source_ats]
        array_to_append = []
        for target_ats in mapping['ats_name'].values:
            array_to_append.append(getProbality(temp['y'].value_counts(), target_ats, len(temp['y'])))
        a_series = pd.Series(array_to_append, index=prop_df.columns)
        prop_df = prop_df.append(a_series, ignore_index=True)
    return prop_df
#%%
# Predict
def predict(ats_name, test_df, prop_df):
    # Return the precomputed top-k candidates for the given source device,
    # or an empty prediction if the device never occurs as a source
    predicted_value = []
    index = mapping.loc[mapping['ats_name'] == ats_name].index.values[0]
    if prop_df.iloc[index].max() > 0.0:
        predicted_value = test_df.loc[index].values[0]
    else:
        predicted_value = ['']
    return predicted_value
#%%
# Calculate accuracy
def calculate_accuracy(y_test, y_pred):
    # A prediction counts as a match if the true next device appears
    # anywhere in the predicted top-k list
    number_of_matches = 0
    for (test_value, pred_values) in zip(y_test, y_pred):
        if test_value in pred_values:
            number_of_matches += 1
    return number_of_matches / len(y_test)
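#%%
# Toy usage of calculate_accuracy (illustrative only): the true next device only
# has to appear somewhere in the predicted top-k list to count as a match
print(calculate_accuracy(['bed', 'rollator'], [['bed', 'wheelchair'], ['bed']]))  # 0.5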
#%% Predict top k
def predict_top_k(k, prop_df):
    # For each source device keep the k most probable next devices,
    # then score the predictions on the held-out test set
    y_pred = []
    test_df = pd.DataFrame()
    test_df['Max'] = prop_df.apply(lambda s: s.abs().nlargest(k).index.tolist(), axis=1)
    for test in X_test:
        y_pred.append(predict(test, test_df, prop_df))
    return calculate_accuracy(y_test, y_pred)
#%%
df_prop = generate_propbality_matrix(X_train, y_train)
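#%%
# Illustrative check (not part of the original script): each row of the transition
# matrix is an empirical distribution, so it should sum to roughly 1 (up to
# rounding) for devices seen as a source in training, and to 0 otherwise
row_sums = df_prop.sum(axis=1)
print(row_sums.describe())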
#%%
# Predict for k in range 1-5
results = [predict_top_k(k, df_prop) for k in range(1, 6)]
accuracy_results = [round(num * 100, 2) for num in results]
for k, acc in enumerate(accuracy_results, start=1):
    print('top ' + str(k) + ': ' + str(acc))