Commit ae55edaa authored by Christian Marius Lillelund's avatar Christian Marius Lillelund
Browse files

extended api, added shap values

parent 291b7840
Pipeline #47260 failed with stage
in 34 seconds
......@@ -21,6 +21,8 @@ sdist/
var/
reports/
models/
src/api/embeddings
src/api/models
*.egg-info/
.installed.cfg
*.egg
......
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
import datetime as dt
from tools import preprocessor, data_loader
import config as cfg
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Silence TensorFlow INFO/WARNING chatter in notebook output.
tf.get_logger().setLevel('ERROR')

# Dataset to analyse; the NAts columns must be read as strings so that
# leading zeros in the assistive-technology (ATS) codes are preserved.
filename = "complete.csv"
converters = {str(i) + 'Ats': str for i in range(1, cfg.ATS_RESOLUTION + 1)}

# Load features X and target y, then work on one combined frame.
dl = data_loader.CompleteDataLoader(file_name=filename, converters=converters).load_data()
X, y = dl.get_data()
df = pd.concat([X, y], axis=1)

# Derive age from birth year; vectorized subtraction replaces the
# original per-row lambda. The constant 121 presumably encodes a
# two-digit birth year relative to 2021 — TODO confirm reference year.
df['Age'] = 121 - df['BirthYear']
```
%% Cell type:code id: tags:
``` python
# Preview the first five rows to sanity-check the loaded frame.
df.head()
```
%% Output
Gender BirthYear Cluster LoanPeriod NumberAts 1Ats 2Ats 3Ats \
0 0 31 19 1839.0 12 091218 093307 091203
1 0 35 13 987.0 9 091218 181210 043306
2 0 33 18 2696.0 8 222704 222403 180903
3 0 33 18 2756.0 8 222704 222403 180903
4 0 46 20 1700.0 40 122303 122303 120606
4Ats 5Ats ... 43Ats 44Ats 45Ats 46Ats 47Ats 48Ats 49Ats 50Ats \
0 091233 120606 ... 0 0 0 0 0 0 0 0
1 123621 120612 ... 0 0 0 0 0 0 0 0
2 120606 093307 ... 0 0 0 0 0 0 0 0
3 120606 093307 ... 0 0 0 0 0 0 0 0
4 120306 093307 ... 0 0 0 0 0 0 0 0
Complete Age
0 0 90
1 0 86
2 1 88
3 1 88
4 1 75
[5 rows x 57 columns]
Gender BirthYear Cluster LoanPeriod NumberAts 1Ats 2Ats 3Ats \\n0 0 21 18 516.0 15 093307 091212 120612 \n1 0 45 12 315.0 2 120606 093307 0 \n2 0 45 13 141.0 8 120606 093307 183015 \n3 0 45 13 142.0 8 120606 093307 183015 \n4 0 45 13 159.0 9 120606 093307 183015 \n\n 4Ats 5Ats ... 43Ats 44Ats 45Ats 46Ats 47Ats 48Ats 49Ats 50Ats \\n0 120612 222718 ... 0 0 0 0 0 0 0 0 \n1 0 0 ... 0 0 0 0 0 0 0 0 \n2 123103 120612 ... 0 0 0 0 0 0 0 0 \n3 123103 120612 ... 0 0 0 0 0 0 0 0 \n4 123103 120612 ... 0 0 0 0 0 0 0 0 \n\n Complete Age \n0 0 100 \n1 1 76 \n2 1 76 \n3 1 76 \n4 1 76 \n\n[5 rows x 57 columns]
%% Cell type:code id: tags:
``` python
# Class balance of the target: counts of Complete == 1 vs == 0.
df.Complete.value_counts()
```
%% Output
1 1465
0 576
Name: Complete, dtype: int64
1    1543
0     601
Name: Complete, dtype: int64
%% Cell type:code id: tags:
``` python
import seaborn as sns

# Bar chart of the target distribution (Complete = 0/1), saved to the
# reports directory.
var = df['Complete']
varValue = var.value_counts()
plt.figure()
plt.bar(varValue.index, varValue)
plt.xticks(varValue.index, varValue.index.values)
plt.ylabel("Frequency")
plt.title('Complete')
# Plain string: the original was an f-string with no placeholders (F541).
file_name = "Complete bar.pdf"
plt.savefig(Path.joinpath(cfg.REPORTS_PLOTS_DIR, file_name), dpi=300, bbox_inches="tight")
```
%% Output
%% Cell type:code id: tags:
``` python
plot = sns.scatterplot(data=df, x="Age", y="NumberAts", hue="Complete")
plt.title("Scatter plot of NumberAts vs Age")
fig = plot.get_figure()
file_name = f"Complete scatter NumberAts Age.pdf"
plt.savefig(Path.joinpath(cfg.REPORTS_PLOTS_DIR, file_name), dpi=300, bbox_inches = "tight")
```
%% Output
%% Cell type:code id: tags:
``` python
# Age distribution per target class, one facet per value of Complete.
g = sns.FacetGrid(df, col="Complete")
# NOTE(review): sns.distplot is deprecated (removed in seaborn >= 0.14);
# migrate to sns.histplot / sns.displot when upgrading seaborn.
g.map(sns.distplot, "Age", bins=25)
g.fig.suptitle("Number of citizens who complete given age")
g.fig.subplots_adjust(top=.8)
# Plain string: the original was an f-string with no placeholders (F541).
file_name = "Complete facetgrid age.pdf"
plt.savefig(Path.joinpath(cfg.REPORTS_PLOTS_DIR, file_name), dpi=300, bbox_inches="tight")
```
%% Output
%% Cell type:code id: tags:
``` python
def get_ats_list(df, resolution=None):
    """Collect every non-zero ATS code from the 1Ats..<resolution>Ats columns.

    Each cell may hold a comma-separated string of ATS codes; "0" marks an
    empty slot and is skipped.

    Args:
        df: frame holding the string-typed ATS columns.
        resolution: number of ATS columns to scan; defaults to
            cfg.ATS_RESOLUTION for backward compatibility.

    Returns:
        list[str]: all codes in column order (1Ats first), duplicates kept.
    """
    if resolution is None:
        resolution = cfg.ATS_RESOLUTION
    ats_cols = [f"{i}Ats" for i in range(1, resolution + 1)]
    return [
        ats
        for ats_col in ats_cols
        for ats_string in df[ats_col]
        for ats in ats_string.split(",")
        if ats != "0"
    ]
# Per-class ATS usage: the 20 most frequent codes per class, scaled by
# the class's total number of codes so the two classes are comparable.
ats_no_complete = pd.Series(get_ats_list(df.loc[df['Complete'] == 0]))
ats_complete = pd.Series(get_ats_list(df.loc[df['Complete'] == 1]))
# rename(...).to_frame() is robust to whatever name value_counts() gives
# its result (pandas >= 2 names it 'count', which would make the original
# pd.DataFrame(..., columns=[...]) pattern produce an all-NaN column).
a = ats_no_complete.value_counts()[:20].rename('No complete quantity').to_frame()
b = ats_complete.value_counts()[:20].rename('Complete quantity').to_frame()
ats_df = pd.concat([a, b], axis=1).fillna(0)
ats_df.index.names = ['Ats']
ats_df = ats_df.reset_index()
ats_df['No complete quantity'] = ats_df['No complete quantity'] / len(ats_no_complete)
ats_df['Complete quantity'] = ats_df['Complete quantity'] / len(ats_complete)
```
%% Cell type:code id: tags:
``` python
# Stacked bar: relative usage share of the top ATS codes per class.
plt.bar(ats_df["Ats"], ats_df["No complete quantity"], label="No complete")
plt.bar(ats_df["Ats"], ats_df["Complete quantity"], bottom=ats_df["No complete quantity"], label="Complete")
plt.legend()
plt.xticks(rotation=90)
plt.ylabel("Scaled ats usage")
plt.title('Scaled plot of ats usage for complete')
# Plain string: the original was an f-string with no placeholders (F541).
file_name = "Complete scaled ats usage.pdf"
plt.savefig(Path.joinpath(cfg.REPORTS_PLOTS_DIR, file_name), dpi=300, bbox_inches="tight")
```
%% Output
......
from pandas_profiling import ProfileReport
from tools import file_reader, file_writer
import config as cfg
def main():
    """Generate pandas-profiling HTML reports for the two processed datasets."""
    # Read the exercise (1Ex..9Ex) and ATS (1Ats..10Ats) columns as
    # strings so codes keep their leading zeros.
    ex = {str(i)+'Ex':str for i in range(1,10)}
    ats = {str(i)+'Ats':str for i in range(1,11)}
    complete = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
                                    'complete.csv',
                                    converters={**ex, **ats})
    fall = file_reader.read_csv(cfg.PROCESSED_DATA_DIR,
                                'fall.csv',
                                converters=ats)
    profile_completed = ProfileReport(complete)
    profile_fall = ProfileReport(fall)
    file_writer.write_report(profile_completed, cfg.REPORTS_DIR, 'profile_complete.html')
    file_writer.write_report(profile_fall, cfg.REPORTS_DIR, 'profile_fall.html')

if __name__ == "__main__":
    main()
\ No newline at end of file
#!/usr/bin/env python
import utility.cluster as cluster_ut
import utility.dataset as dataset_ut
import config as cfg
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import RandomizedSearchCV
from tools import file_reader, file_writer, preprocessor, feature_maker
from utility import cluster
from kmodes.kmodes import KModes
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
def main():
    """Randomized search over KModes clustering settings, scored through a
    RandomForest classifier on the 'Complete' target.
    """
    df = file_reader.read_csv(cfg.INTERIM_DATA_DIR, 'screenings.csv',
                              converters={'CitizenId': str})
    # Explode the comma-separated Ats string into cfg.ATS_RESOLUTION columns.
    df = preprocessor.split_categorical_columns(df,
                                                col='Ats',
                                                tag='Ats',
                                                resolution=cfg.ATS_RESOLUTION)
    df = feature_maker.make_complete_feature(df)
    general_cols = df[['CitizenId', 'Gender', 'BirthYear', 'LoanPeriod']]
    # NOTE(review): non-raw regex triggers an invalid-escape warning, and
    # [Ats] is a character class (one char of A/t/s), not the literal
    # "Ats" — it happens to match columns like '12Ats' but should be
    # written r'\d+Ats'.
    emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)
    df = pd.concat([general_cols, emb_cols, df[['Complete']]], axis=1)

    class ClusterMaker(BaseEstimator, TransformerMixin):
        # Pipeline step that replaces the raw ATS columns with a single
        # KModes cluster id, so the classifier sees Gender / BirthYear /
        # LoanPeriod + Cluster.
        def __init__(self, init='random', n_clusters=1, n_init=1):
            self.init = init
            self.n_clusters = n_clusters
            self.n_init = n_init

        def fit(self, X, y=None):
            # Stateless: the clustering is (re)fit inside transform.
            return self

        def transform(self, X, y=None):
            # Columns 4+ hold the ATS code columns; cluster on those only.
            model = KModes(init=self.init, n_clusters=self.n_clusters,
                           n_init=self.n_init,
                           cat_dissim=cluster.calculate_disimmilarity,
                           n_jobs=-1)
            model.fit(X.iloc[:, 4:].astype(str))
            predictions = model.predict(X.iloc[:, 4:].astype(str))
            predictions = pd.Series(predictions, name="Cluster")
            # Keep Gender/BirthYear/LoanPeriod; drop CitizenId (column 0).
            X = X.iloc[:, 1:4]
            X = X.reset_index(drop=True)
            X['Cluster'] = predictions
            return X

    pipeline = Pipeline([
        ('cluster_maker', ClusterMaker()),
        ('clf', RandomForestClassifier(random_state=0))
    ])
    param_grid = [
        {
            'cluster_maker__init': ['random', 'Huang', 'Cao'],
            'cluster_maker__n_clusters': [2, 5, 10, 20, 25, 30,
                                          35, 40, 45, 50, 100],
            'cluster_maker__n_init': [1, 5, 10, 20]
        }
    ]
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    random_search = RandomizedSearchCV(pipeline, param_distributions=param_grid,
                                       n_iter=1, scoring='neg_log_loss',
                                       cv=skf, n_jobs=-1, verbose=3,
                                       random_state=0)
    X = df[['CitizenId', 'Gender', 'BirthYear', 'LoanPeriod'] + list(emb_cols)]
    y = df['Complete']
    random_search.fit(X, y)
    print('\n All results:')
    print(random_search.cv_results_)
    print('\n Best estimator:')
    print(random_search.best_estimator_)
    # NOTE(review): the message hard-codes (5, 5), but n_iter=1 above —
    # the second figure does not reflect the sampled combination count.
    print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (5, 5))
    print(random_search.best_score_ * 2 - 1)
    print('\n Best hyperparameters:')
    print(random_search.best_params_)
    results = pd.DataFrame(random_search.cv_results_)
    file_writer.write_csv(results, cfg.REPORTS_DIR, 'kmodes-settings-random-grid-search-results.csv')

if __name__ == '__main__':
    main()
from typing import List
import fastapi
from numpy.lib.shape_base import column_stack
import uvicorn
import pydantic
import config as cfg
import pandas as pd
import numpy as np
import api.utility as utility
from typing import List
from numpy.lib.shape_base import column_stack
from pathlib import Path
from tools import file_reader, preprocessor
# NOTE(review): diff artifact — the FastAPI app is created twice; the
# second assignment (version 0.1) is the one that takes effect at runtime.
app = fastapi.FastAPI(title='AIR API', version='1.0',
                      description='A simple model that classifies citizens based on data')

# Local copy of the ATS column resolution used by this API module.
ATS_RESOLUTION = 50

app = fastapi.FastAPI(title='AIR API', version='0.1',
                      description='A simple API that classifies citizens based on data')
class Features(pydantic.BaseModel):
Gender: int
......@@ -20,15 +20,34 @@ class Features(pydantic.BaseModel):
@app.get('/')
def index():
    """Landing endpoint returning the API banner."""
    # NOTE(review): diff artifact — two return statements survived the
    # merge; the second one ('v. 0.1') is unreachable.
    return {'message': 'AIR API v. 1.0'}
    return {'message': 'AIR API v. 0.1'}
@app.post('/predict_complete')
def predict_complete(incoming_data: Features):
    """Predict the 'complete' outcome for one citizen and explain it.

    Returns the predicted class, its probability, and per-feature SHAP
    values rounded to three decimals.
    """
    data = incoming_data.dict()
    df = prepare_data(data, 'complete')
    model = utility.read_joblib('complete_xgboost.joblib')
    prediction = model.predict(df)
    # max over the class probabilities = confidence of the predicted class.
    probability = model.predict_proba(df).max()
    _, shap_values = utility.get_shap_values(model, X_test=df)
    # Single-row input, so shap_values[0] holds the one explanation vector.
    shap_values_flat = [round(float(val), 3) for val in shap_values[0]]
    shap_values_dict = dict(zip(df.columns, shap_values_flat))
    return {
        'prediction': int(prediction[0]),
        'probability': float(probability),
        'shap_values': shap_values_dict,
    }
# NOTE(review): diff artifact — the old '/predict' route and the new
# '/predict_success' route were interleaved by the diff capture; the
# stacked decorators/defs and the duplicated prepare_data / model-load
# lines are not valid Python as-is. The predict_success / utility lines
# are the intended (new) version; the others should be deleted.
@app.post('/predict')
def get_prediction(incoming_data: Features):
@app.post('/predict_success')
def predict_success(incoming_data: Features):
    data = incoming_data.dict()
    df = prepare_data(data)
    df = prepare_data(data, 'success')
    model = file_reader.read_joblib(cfg.COMPLETE_XGB_DIR,
                                    'complete_xgboost.joblib')
    model = utility.read_joblib('success_xgboost.joblib')
    prediction = model.predict(df)
    probability = model.predict_proba(df).max()
......@@ -38,28 +57,52 @@ def get_prediction(incoming_data: Features):
'probability': float(probability)
}
# NOTE(review): diff artifact — the bare old signature below has no body;
# the real prepare_data definition appears further down in the file.
def prepare_data(data):

@app.post('/predict_fall')
def predict_fall(incoming_data: Features):
    """Predict the 'fall' outcome for one citizen."""
    data = incoming_data.dict()
    df = prepare_data(data, 'fall')
    model = utility.read_joblib('fall_xgboost.joblib')
    prediction = model.predict(df)
    # max over the class probabilities = confidence of the predicted class.
    probability = model.predict_proba(df).max()
    return {
        'prediction': int(prediction[0]),
        'probability': float(probability)
    }
def prepare_data(data: dict, case: str) -> pd.DataFrame:
    """Turn one request payload into the model's feature frame.

    NOTE(review): this body contains diff residue — both the old
    (preprocessor / file_reader / cfg) and new (utility / ATS_RESOLUTION)
    variants of several steps survived the merge. At runtime the later
    assignment wins, but the dead lines should be removed.
    """
    # Wrap scalars in one-element lists so the dict becomes a one-row frame.
    new_data = {k: [v] for k, v in data.items()}
    new_data_df = pd.DataFrame.from_dict(new_data)
    # Number of loaned aids = number of comma-separated ATS codes.
    new_data_df['NumberAts'] = len(new_data_df['Ats'][0].split(","))
    # Old variant (dead — overwritten just below):
    df = preprocessor.split_categorical_columns(new_data_df,
                                                col='Ats',
                                                tag='Ats',
                                                resolution=cfg.ATS_RESOLUTION)
    cols_ats = [str(i)+'Ats' for i in range(1, cfg.ATS_RESOLUTION+1)]
    header_list = ['Gender', 'BirthYear', 'Cluster', 'LoanPeriod', 'NumberAts'] + cols_ats
    # New variant (effective):
    df = utility.split_categorical_columns(new_data_df, col='Ats', tag='Ats',
                                           resolution=ATS_RESOLUTION)
    cols_ats = [str(i)+'Ats' for i in range(1, ATS_RESOLUTION+1)]
    header_list = ['Gender', 'BirthYear', 'Cluster',
                   'LoanPeriod', 'NumberAts'] + cols_ats
    # Align to the training header and pad missing ATS slots with '0'.
    df = df.reindex(columns=header_list)
    df = df.fillna('0')
    # Old embedding-loading variant (dead):
    path = Path.joinpath(cfg.PROCESSED_DATA_DIR, 'complete_embeddings')
    for i in range(1, cfg.ATS_RESOLUTION+1):
        embedding = file_reader.read_embedding(path, f'{i}Ats.csv')
    # New variant: derive a KModes cluster from the raw ATS columns...
    citizen_ats = df.filter(regex='((\d+)[Ats])\w+', axis=1).values
    new_data_df['Cluster'] = get_cluster(citizen_ats)
    new_data_df['Cluster'] = pd.to_numeric(new_data_df['Cluster'])
    # ...then replace each ATS code with its learned embedding value.
    for i in range(1, ATS_RESOLUTION+1):
        embedding = utility.read_embedding(f'{case}_{i}Ats.csv')
        column = f'{i}Ats'
        df[column] = df[column].replace(to_replace=embedding)
        df[column] = pd.to_numeric(df[column])
    # NOTE(review): 'Cluster' was assigned on new_data_df, not df — at this
    # point df['Cluster'] still holds the '0' fill value; verify which
    # frame is supposed to carry the cluster id into the model.
    df['Cluster'] = pd.to_numeric(df['Cluster'])
    return df
def get_cluster(ats):
    """Assign the citizen's ATS usage to a KModes cluster.

    Loads the persisted clustering model and returns the cluster id of the
    single provided row as a plain int.
    """
    kmodes_model = utility.read_pickle('km.pickle')
    return int(kmodes_model.predict(ats)[0])
# Run a local development server when the module is executed directly.
if __name__ == "__main__":
    uvicorn.run(app, port=8000, host="0.0.0.0")
\ No newline at end of file