Commit d7691eb0 authored by Christian Marius Lillelund's avatar Christian Marius Lillelund

added db script, retrained models

parent 06de8271
@@ -22,3 +22,4 @@ imbalanced-learn==0.8.0
seaborn==0.11.2
scikit-survival==0.16.0
eli5==0.11.0
pyodbc==4.0.32
\ No newline at end of file
#!/usr/bin/env python
import pandas as pd
import numpy as np
from pathlib import Path
import paths as pt
import pyodbc
from tools import preprocessor
from datetime import date, datetime, timedelta
import random
def main():
    # Load data
    df_home_care = pd.read_csv(Path.joinpath(pt.RAW_DATA_DIR_TEST, "Hjemmepleje.csv"),
                               encoding="iso-8859-1",
                               skiprows=2)
    df_ats = pd.read_csv(Path.joinpath(pt.RAW_DATA_DIR_TEST, "Hjælpemidler.csv"),
                         encoding="iso-8859-1",
                         skiprows=2,
                         converters={'HMI nr': str, 'Kategori ISO nummer': str})
    df_training = pd.read_csv(Path.joinpath(pt.RAW_DATA_DIR_TEST, "Træning.csv"),
                              encoding="iso-8859-1",
                              skiprows=2)
    df_general = df_home_care.drop_duplicates(subset=["Borger Id"])[["Borger Id",
                                                                     "Alder (aktuel)"]].reset_index(drop=True)
    df_general['Køn'] = np.random.randint(0, 2, df_general.shape[0])
    df_general['Køn'] = df_general['Køn'].apply(lambda x: "Mand" if x == 0 else "Kvinde")
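    # NOTE: gender, names and CPR numbers below are randomly generated, so the
    # citizens inserted into the database are synthetic test data.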
    def get_first_name(gender, first_name_number):
        first_names_female = ["Jytte", "Oda", "Maren", "Karen", "Åse"]
        first_names_male = ["Karl", "Svend", "Peder", "John", "Frank"]
        if gender == "Kvinde":
            return first_names_female[first_name_number]
        else:
            return first_names_male[first_name_number]

    df_general['Fornavn'] = np.random.randint(0, 5, df_general.shape[0])
    df_general['Fornavn'] = df_general.apply(lambda x: get_first_name(x['Køn'], x['Fornavn']), axis=1)

    def get_last_name(last_name_number):
        last_names = ["Jensen", "Hansen", "Pedersen", "Nielsen", "Larsen"]
        return last_names[last_name_number]

    df_general['Efternavn'] = np.random.randint(0, 5, df_general.shape[0])
    df_general['Efternavn'] = df_general.apply(lambda x: get_last_name(x['Efternavn']), axis=1)

    def get_ssn(age, gender):
        # Pick a random birth date in the implied birth year, format it as
        # DDMMYY and append a CPR-style suffix
        birth_year = 2021 - age
        start_date = date(birth_year, 1, 1)
        end_date = date(birth_year, 12, 31)
        days_between_dates = (end_date - start_date).days
        random_number_of_days = random.randrange(days_between_dates)
        random_date = start_date + timedelta(days=random_number_of_days)
        random_date = str(datetime.strftime(random_date, '%d%m%y'))
        if gender == "Kvinde":
            last_four = str(random.randint(100, 999)) + "4"  # even final digit for women
        else:
            last_four = str(random.randint(100, 999)) + "3"  # odd final digit for men
        ssn = random_date + "-" + last_four
        return ssn
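    # Illustrative example (output is random): get_ssn(83, "Kvinde") could
    # return "170538-4274", i.e. a DDMMYY date in 2021 - 83 = 1938 followed
    # by three random digits and a trailing "4".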
    df_general['CPR'] = df_general.apply(lambda x: get_ssn(x['Alder (aktuel)'], x['Køn']), axis=1)

    # Insert general data
    server = "tcp:air-db-server.database.windows.net,1433"
    database = "air-db"
    username = "airadmin"
    password = "!airpassword123"
    cnxn = pyodbc.connect('DRIVER={SQL Server};SERVER=' + server + ';DATABASE=' + database
                          + ';UID=' + username + ';PWD=' + password)
    cursor = cnxn.cursor()
    ids = list()
    for index, row in df_general.iterrows():
        cursor.execute("INSERT INTO dbo.citizen (first_name,last_name,ssn,age,gender) values(?,?,?,?,?)",
                       row['Fornavn'],
                       row['Efternavn'],
                       row['CPR'],
                       row['Alder (aktuel)'],
                       row['Køn'])
        cursor.execute("SELECT @@IDENTITY AS ID;")  # identity of the row just inserted
        ids.append((cursor.fetchone()[0], row['Borger Id']))
        break  # NOTE: only the first citizen is inserted
    cnxn.commit()
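    # NOTE: @@IDENTITY returns the last identity value generated on this
    # connection; SCOPE_IDENTITY() is usually the safer choice if triggers
    # also insert rows.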
    # Home care
    for index, _ in df_home_care.iterrows():
        citizen_id = int(ids[index][0])
        citizen_guid = ids[index][1]
        citizen_rows = df_home_care.loc[df_home_care['Borger Id'] == citizen_guid]
        counter = 0
        for _, row in citizen_rows.iterrows():
            cursor.execute("INSERT INTO dbo.home_care (citizen_id,date,type,name,time_slot) values(?,?,?,?,?)",
                           citizen_id,
                           row['Dato'],
                           row['Paragraf'],
                           row['Ydelse navn'],
                           row['Bestilt tid (minutter)'].replace(",", "."))
            counter = counter + 1
            if counter > 4:  # keep at most five home care records per citizen
                break
        break  # NOTE: only the first citizen is processed
    cnxn.commit()

    # Training
    for index, _ in df_training.iterrows():
        citizen_id = int(ids[index][0])
        citizen_guid = ids[index][1]
        citizen_rows = df_training.loc[df_training['Borger Id'] == citizen_guid]
        counter = 0
        for _, row in citizen_rows.iterrows():
            cursor.execute("INSERT INTO dbo.training_plans (citizen_id,name,start_date,end_date,status,time_slot) values(?,?,?,?,?,?)",
                           citizen_id,
                           row['Ydelse navn'],
                           row['Forløbsdato start'],
                           row['Forløbsdato slut'],
                           row['Visiteret status'],
                           row['Bestilt tid (minutter)'])
            counter = counter + 1
            if counter > 4:  # keep at most five training records per citizen
                break
        break  # NOTE: only the first citizen is processed
    cnxn.commit()

    # Ats
    ats = pd.read_csv(Path.joinpath(pt.REFERENCES_DIR, 'ats.csv'), converters={'ats_id': str})
    df_ats['Kategori ISO nummer'] = df_ats['Kategori ISO nummer'].apply(lambda x: x[:6])
    df_ats['Kategori ISO navn'] = preprocessor.replace_cat_values(df_ats[['Kategori ISO nummer']], ats)
    for index, _ in df_ats.iterrows():
        citizen_id = int(ids[index][0])
        citizen_guid = ids[index][1]
        citizen_rows = df_ats.loc[df_ats['Borger id'] == citizen_guid]
        counter = 0
        for _, row in citizen_rows.iterrows():
            cursor.execute("INSERT INTO dbo.assistive_aids (citizen_id,name,iso,initiative_name,paragraph,lend_date) values(?,?,?,?,?,?)",
                           citizen_id,
                           row['Kategori ISO navn'],
                           row['Kategori ISO nummer'],
                           row['Indsats navn'],
                           row['Paragraf'],
                           row['Kørselsdato'])
            counter = counter + 1
            if counter > 4:  # keep at most five aids per citizen
                break
        break  # NOTE: only the first citizen is processed
    cnxn.commit()
    cursor.close()
    cnxn.close()

if __name__ == '__main__':
    main()
\ No newline at end of file
# %%
from pathlib import Path
from tools import file_reader
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import numpy as np
import pickle
path = Path('C:/Users/Daniel/Documents/air/R_and_D_Daniel_Thomas') # Local path for data-files (ats.csv etc.)
#%%
# Get data
def load_data(path):
    folder = 'raw_data'
    raw_df = file_reader.read_pickle(path, f'{folder}/ats.pkl').reset_index(drop=True)
    mapping = file_reader.read_csv(path, f'{folder}/ats.csv',
                                   converters={'ats_id': str})
    raw_df = raw_df.loc[raw_df['Gender'] != 'nan']
    print('Loaded data')
    return raw_df, mapping
# Shorten iso class
def shorten_iso(raw_df, mapping):
df = raw_df.copy()
df['DevISOClass'] = df['DevISOClass'].apply(lambda x: x[:6])
df = df.dropna(subset=['CitizenId'])
mapping_dict = dict(mapping.values)
df = df.replace(to_replace=mapping_dict)
df.sort_values(by='LendDate', inplace=True)
cond = df['DevISOClass'].str.isdigit()
df.drop(df[~cond].index, inplace=True)
print('ISO-numbers shortened')
return df
# Remove citizens with only 1 record
def remove_single_records(df, seq_length):
counts = df.CitizenId.value_counts()
df = df[~df['CitizenId'].isin(counts[counts < seq_length + 1].index)]
print('Single records removed')
return df
# Create list of sequence
def create_sequence_list_LSTM(df, seq_length, column_name='DevISOClass'):
df = remove_single_records(df, seq_length)
X = []
y = []
print(f'shape: {df.shape}')
for citizenId in df.CitizenId.unique():
temp = df.loc[df.CitizenId==citizenId]
seq = tf.keras.preprocessing.sequence.TimeseriesGenerator(temp[column_name].values,
temp[column_name].values,
length=seq_length,
batch_size=1,
shuffle=False)
        for x in seq:
            X.append(x[0][0].tolist())
            y.append(x[1])
return X, y
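# Toy sanity check (illustrative data, not part of the pipeline): with the
# series [1, 2, 3, 4] and length=2, each window of two items predicts the
# next one, i.e. [1, 2] -> 3 and [2, 3] -> 4.
_demo = tf.keras.preprocessing.sequence.TimeseriesGenerator(
    np.array([1, 2, 3, 4]), np.array([1, 2, 3, 4]), length=2, batch_size=1)
for _x, _y in _demo:
    print(_x, _y)  # [[1 2]] [3], then [[2 3]] [4]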
# Create list of sequence with zero-padding
def create_sequence_list_LSTM_pad(df, seq_length, column_name='DevISOClass'):
X = []
y = []
for temp_seq in range(seq_length):
temp_seq = temp_seq + 1
temp_df = remove_single_records(df, temp_seq)
for i, citizenId in enumerate(temp_df.CitizenId.unique()):
            # NOTE: this skips the first citizen entirely; TimeseriesGenerator
            # already emits no window until enough prior samples exist.
            if i == 0:
                continue
temp = temp_df.loc[temp_df.CitizenId==citizenId]
seq = tf.keras.preprocessing.sequence.TimeseriesGenerator(temp[column_name].values,
temp[column_name].values,
length=temp_seq,
batch_size=1,
shuffle=False)
for x in seq:
to_append = np.pad(x[0][0], (seq_length-temp_seq, 0), 'constant', constant_values=(0))
X.append(to_append)
y.append(x[1])
return X, y
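# Padding example (illustrative): np.pad([7, 8], (3, 0), 'constant') gives
# [0, 0, 0, 7, 8], so shorter histories are left-padded with the reserved
# 0 token up to seq_length.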
def make_strings_categorical(string_list, mapping_to_numbers=None):
    # Build a string -> int mapping; labels start at 1 so that 0 stays
    # reserved for the zero-padding token
    if mapping_to_numbers is None:
        mapping_to_numbers = {}
    for raw_label in string_list:
        if raw_label not in mapping_to_numbers:
            mapping_to_numbers[raw_label] = len(mapping_to_numbers) + 1
    mapping_to_numbers['ZeroPadding'] = 0
    return mapping_to_numbers
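# Example (illustrative): make_strings_categorical(['cane', 'bed', 'cane'])
# -> {'cane': 1, 'bed': 2, 'ZeroPadding': 0}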
def convert_str_to_int(word, mapping):
return mapping[word]
def load_pickle(seq_length):
with open(f'preprocessed_data_LSTM/X_{seq_length}.pkl', 'rb') as f:
X = pickle.load(f)
with open(f'preprocessed_data_LSTM/y_{seq_length}.pkl', 'rb') as f:
y = pickle.load(f)
return X, y
#%%
# Initialize settings for run
LOAD_PREPROCESSED_DATA = False
PREPROCESSED_SEQUENCE_NAME = 10
ZERO_PAD_SEQUENCES = True
RUN_SMALLER_SAMPLE_SIZE = False
seq_length = 10
OVERWRITE_LOCAL_PREPROCESSED_SEQUENCES = False
#%%
raw_df, mapping = load_data(path)
if RUN_SMALLER_SAMPLE_SIZE:
raw_df = raw_df.head(50000)
counts = raw_df.DevISOClass.value_counts()
# raw_df = raw_df.loc[raw_df.DevISOClass.isin(counts.index[counts > seq_length])]
df = shorten_iso(raw_df, mapping)
#%%
string_to_int_mapping = make_strings_categorical(df.DevISOClass.values)
vocabulary_size = len(string_to_int_mapping)+1
df['DevISOClassCategorical'] = np.vectorize(convert_str_to_int)(df['DevISOClass'], string_to_int_mapping)
# Create sequences
# Load already processed sequences to save time
if LOAD_PREPROCESSED_DATA:
X, y = load_pickle(PREPROCESSED_SEQUENCE_NAME)
elif ZERO_PAD_SEQUENCES:
X, y = create_sequence_list_LSTM_pad(
df[['DevISOClassCategorical', 'CitizenId']],
seq_length,
column_name='DevISOClassCategorical')
else:
X, y = create_sequence_list_LSTM(
df,
seq_length,
column_name='DevISOClassCategorical')
# %%
# Store sequences locally in pickle-file
if OVERWRITE_LOCAL_PREPROCESSED_SEQUENCES:
with open(f'preprocessed_data_LSTM/X_{seq_length}.pkl', 'wb') as f:
pickle.dump(X, f)
with open(f'preprocessed_data_LSTM/y_{seq_length}.pkl', 'wb') as f:
pickle.dump(y, f)
# %%
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)
#%%
train_targets = to_categorical(y_train, num_classes=vocabulary_size)
test_targets = to_categorical(y_test, num_classes=vocabulary_size)
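# Example (illustrative): to_categorical([1, 0], num_classes=3) returns
# [[0., 1., 0.], [1., 0., 0.]], i.e. each target becomes a one-hot row of
# length vocabulary_size.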
# %%
from keras.metrics import top_k_categorical_accuracy

def topKacc_5(Y_true, Y_pred):
    return top_k_categorical_accuracy(Y_true, Y_pred, k=5)

def topKacc_4(Y_true, Y_pred):
    return top_k_categorical_accuracy(Y_true, Y_pred, k=4)

def topKacc_3(Y_true, Y_pred):
    return top_k_categorical_accuracy(Y_true, Y_pred, k=3)

def topKacc_2(Y_true, Y_pred):
    return top_k_categorical_accuracy(Y_true, Y_pred, k=2)
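# These wrappers only pin k; if named functions are not required, tf.keras
# also provides tf.keras.metrics.TopKCategoricalAccuracy(k=...) as a
# built-in equivalent.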
#%%
%load_ext tensorboard
# from tcn import TCN, tcn_full_summary
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Dropout, CuDNNLSTM  # CuDNNLSTM requires a GPU and standalone Keras 2.x
from keras.metrics import top_k_categorical_accuracy
import datetime
filepath = './models/' + datetime.datetime.now().strftime("%Y%m%d-%H%M") + '/{epoch:02d}-{val_loss:.2f}.hdf5'
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=filepath, mode='min', monitor='val_loss', verbose=2, save_best_only=True)
es = EarlyStopping(monitor='val_loss', patience=5)
callbacks_list = [checkpoint, es, tensorboard_callback]
optimizer = tf.keras.optimizers.Adam(learning_rate=0.002)
model = Sequential()
model.add(Embedding(vocabulary_size, 300, input_length=seq_length))
model.add(CuDNNLSTM(units=1024))  # input shape is inferred from the Embedding layer
model.add(Dense(vocabulary_size, activation='softmax'))
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['categorical_accuracy', topKacc_2, topKacc_3, topKacc_4, topKacc_5])
model.summary()
num_epochs = 500
history = model.fit(
np.array(X_train),
train_targets,
validation_data=(np.array(X_test), test_targets),
epochs=num_epochs,
verbose=1,
use_multiprocessing=True,
callbacks=callbacks_list
)
#%%
%tensorboard --logdir logs/fit
# history.history['val_accuracy']
# What's in the project
Two notebooks:
- markov_chains.py -> creates a model based on a Markov chain over assistive aids
- LSTM_model.py -> creates a model based on an LSTM neural network

Folders:
- preprocessed_data_markov -> pickle with already-processed sequences for the Markov notebook
- preprocessed_data_LSTM -> pickles with already-processed sequences for the LSTM notebook
- raw_data -> should contain the raw data for the project (ats.csv and ats.pkl). OBS! These files are not included in the zip file and must be added manually due to data security.
- models -> already-trained LSTM models
- logs -> training logs for the LSTM notebook

# Prerequisites to run the notebooks
- Create a virtual environment based on Python 3.8.x
- Install the requirements: `pip install -r requirements.txt`
- Manually download the data for training (ats.csv and ats.pkl)
- Adjust the path variable in both notebooks to point at the data files
\ No newline at end of file
# %%
from pathlib import Path
import pandas as pd
import tensorflow as tf
from tools import file_reader
from sklearn.model_selection import train_test_split
#%%
# Get data
path = "C:/Users/Daniel/Documents/air/R_and_D_Daniel_Thomas"
raw_df = file_reader.read_pickle(Path(path+"/raw_data"), 'ats.pkl').reset_index(drop=True)
mapping = file_reader.read_csv(Path(path+"/raw_data"), 'ats.csv',
converters={'ats_id': str})
#%%
# GENERATE SEQUENCES OR USE NEXT CELL TO READ FROM PICKLE
# Shorten iso class
df = raw_df.copy()
df['DevISOClass'] = df['DevISOClass'].apply(lambda x: x[:6])
df = df.dropna(subset=['CitizenId'])
mapping_dict = dict(mapping.values)
df = df.replace(to_replace=mapping_dict)
df.sort_values(by='LendDate', inplace=True)
cond = df['DevISOClass'].str.isdigit()
df.drop(df[cond].index, inplace=True)  # drop classes still numeric, i.e. never replaced by a name from the mapping
# Remove citizens with only 1 record
counts = df.CitizenId.value_counts()
df = df[~df['CitizenId'].isin(counts[counts < 2].index)]
# Create the list of sequences: windows of length 1, i.e. a first-order Markov model (current aid -> next aid)
combinedseq = []
for citizenId in df.CitizenId.unique():
temp = df.loc[df.CitizenId==citizenId]
seq = tf.keras.preprocessing.sequence.TimeseriesGenerator(temp.DevISOClass.values,
temp.DevISOClass.values,
length=1,
batch_size=1,
shuffle=False)
combinedseq.append(seq)
# Get data from list of sequence
flat_list = [item for sublist in combinedseq for item in sublist]
data = list(map(lambda row: (row[0][0][0], row[1][0]), flat_list))
# Unpack data and generate test and training sets
(x,y) = zip(*data)
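# x and y are now parallel tuples of (current aid, next aid) pairs drawn from
# each citizen's lend history, in chronological order.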
# %%
# Alternative: load preprocessed sequences from pickle instead of the cell above
data_df = file_reader.read_pickle(Path(path+"/preprocessed_data_markov"), 'sequences.pkl').reset_index(drop=True)
x = data_df['x'].values
y = data_df['y'].values
#%%
# generate test and training sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
#%%
def get_probability(counts, index, length):
    # Share of transitions from the current aid that lead to `index`
    try:
        return round(counts[index] / length, 3)
    except KeyError:
        return 0
#%%
def generate_probability_matrix(x, y):
    # Cell (i, j) holds P(next aid = column j | current aid = row i),
    # with rows and columns ordered as in mapping['ats_name']
    df_train = pd.DataFrame({'x': x, 'y': y})
    rows = []
    for ats_name in mapping['ats_name'].values:
        temp = df_train.loc[df_train['x'] == ats_name]
        counts = temp['y'].value_counts()
        rows.append([get_probability(counts, next_ats_name, len(temp['y']))
                     for next_ats_name in mapping['ats_name'].values])
    return pd.DataFrame(rows, columns=mapping['ats_name'].values)
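# Illustrative example (toy data): with transitions x = ['a', 'a', 'b'] and
# y = ['b', 'c', 'b'], the row for 'a' would hold P(b|a) = 0.5 and
# P(c|a) = 0.5, and the row for 'b' would hold P(b|b) = 1.0.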
#%%
# Predict
def predict(ats_name, test_df, prob_df):
    # Return the candidate list for this aid, or an empty guess if the aid
    # never occurred as a predecessor in the training data
    index = mapping.loc[mapping['ats_name'] == ats_name].index.values[0]
    if prob_df.iloc[index].max() > 0.0:
        return test_df.loc[index].values[0]
    return ['']
#%%
# Calculate accuracy
def calculate_accuracy(y_test, y_pred):
    # A prediction counts as a hit if the true next aid is among the candidates
    number_of_matches = 0
    for test_value, pred_values in zip(y_test, y_pred):
        if test_value in pred_values:
            number_of_matches += 1
    return number_of_matches / len(y_test)
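# Example (illustrative): calculate_accuracy(['a', 'b'], [['a', 'c'], ['c', 'd']])
# -> 0.5, since only the first true value appears among its candidate list.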
#%% Predict top k
def predict_top_k(k, prob_df):
    # For every current aid, keep the k most probable next aids, then score X_test
    y_pred = []
    test_df = pd.DataFrame()
    test_df['Max'] = prob_df.apply(lambda s: s.abs().nlargest(k).index.tolist(), axis=1)
    for test in X_test:
        y_pred.append(predict(test, test_df, prob_df))
    return calculate_accuracy(y_test, y_pred)
#%%
df_prob = generate_probability_matrix(X_train, y_train)
#%%
# predict for k in range 1-5
results = [predict_top_k(k, df_prob) for k in range(1, 6)]
accuracy_results = [round(num * 100, 2) for num in results]
for k, acc in enumerate(accuracy_results, start=1):
    print(f'top {k}: {acc}')
\ No newline at end of file