Commit 1ca3d0a0 authored by Christian Marius Lillelund's avatar Christian Marius Lillelund
Browse files

Updated the API so that /predict_complete also returns the compliance probability.

parent 995cd38e
Pipeline #74017 passed with stage
in 3 minutes and 6 seconds
This diff is collapsed.
......@@ -12,7 +12,7 @@ tensorflow==2.5.0
openpyxl==3.0.6
xgboost==1.3.3
keras-tuner==1.0.2
shap==0.39.0
fastapi==0.63.0
uvicorn==0.13.4
fastapi-jwt-auth==0.5.0
......
......@@ -16,7 +16,7 @@ from fastapi import Request, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
# Number of positional Ats columns produced when expanding the categorical
# 'Ats' field (see prepare_data). Diff residue left two assignments here
# (50 then 10); 10 is the post-commit value.
ATS_RESOLUTION = 10
# FastAPI application instance; the endpoints below are registered on it.
app = FastAPI(title='AIR API', version='0.1',
              description='A simple API that classifies citizens based on data')
......@@ -86,8 +86,15 @@ class InputData(pydantic.BaseModel):
LoanPeriod: float
Ats: str
class OutputData(pydantic.BaseModel):
    """Response body carrying a single classifier probability."""
    Probability: float  # positive-class probability from predict_proba
class CompleteOutputData(pydantic.BaseModel):
    """Response body for /predict_complete.

    Bundles the completion and compliance probabilities with the
    citizen's cluster assignment and SHAP feature attributions.
    """
    CompleteProb: float    # positive-class probability from the 'complete' model
    ComplianceProb: float  # compliance probability; 0 when completion is unlikely
    ClusterId: int         # citizen's cluster, taken from the prepared frame
    ClusterValues: List    # column of clusters.csv for this cluster id
    ShapValues: dict       # presumably feature -> SHAP value; verify get_shap_values
class FallOutputData(pydantic.BaseModel):
    """Response body for /predict_fall."""
    FallProb: float      # positive-class probability from the 'fall' model
    ClusterId: int       # citizen's cluster, taken from the prepared frame
    ClusterValues: List  # column of clusters.csv for this cluster id
    ShapValues: dict     # presumably feature -> SHAP value; verify get_shap_values
......@@ -157,47 +164,35 @@ def refresh(Authorize: AuthJWT = Depends()):
new_access_token = Authorize.create_access_token(subject=current_user)
return {"access_token": new_access_token}
@app.post('/predict_complete', response_model=CompleteOutputData, tags=["ai"])
def predict_complete(incoming_data: InputData):
    """Predict a citizen's completion probability, plus compliance.

    Scores the 'complete' XGBoost model on the prepared input; when the
    completion probability exceeds 0.5 the 'compliance' model is scored
    as well. Returns both probabilities together with the citizen's
    cluster id/values and SHAP feature attributions.

    NOTE(review): this span was diff residue — the removed
    /predict_compliance endpoint and several stale lines (old decorator,
    `model = ...`, 'Probability' return key) were interleaved here; this
    is the reconstructed post-commit endpoint.
    """
    data = incoming_data.dict()
    df = prepare_data(data, 'complete')

    complete_model = read_joblib('complete_xgboost.joblib')
    compliance_model = read_joblib('compliance_xgboost.joblib')

    # The prepared frame carries a 'Cluster' value that indexes a column
    # in the stored clusters table.
    cluster_id = int(df.iloc[0]['Cluster'])
    clusters = read_dataframe('clusters.csv')
    cluster_values = list(clusters.iloc[:, cluster_id])

    # predict_proba yields [P(class 0), P(class 1)]; index 1 is positive.
    complete_prob = complete_model.predict_proba(df).flatten()[1]
    if complete_prob > 0.5:
        compliance_prob = compliance_model.predict_proba(df).flatten()[1]
    else:
        # Compliance is only scored when completion is likely.
        compliance_prob = 0

    shap_values = get_shap_values(complete_model, X_test=df)
    # Report a definite 0 instead of a weak sub-threshold probability.
    compliance = 0 if compliance_prob < 0.5 else compliance_prob
    return {
        'CompleteProb': float(complete_prob),
        'ComplianceProb': float(compliance),
        'ClusterId': cluster_id,
        'ClusterValues': cluster_values,
        'ShapValues': shap_values
    }
@app.post('/predict_fall', response_model=OutputData, tags=["ai"])
@app.post('/predict_fall', response_model=FallOutputData, tags=["ai"])
def predict_fall(incoming_data: InputData):
data = incoming_data.dict()
df = prepare_data(data, 'fall')
......@@ -211,7 +206,7 @@ def predict_fall(incoming_data: InputData):
shap_values = get_shap_values(model, X_test=df)
return {
'Probability': float(probability),
'FallProb': float(probability),
'ClusterId': int(df.iloc[0]['Cluster']),
'ClusterValues': cluster_values,
'ShapValues': shap_values
......@@ -225,9 +220,13 @@ def prepare_data(data: dict, case: str) -> pd.DataFrame:
df = split_categorical_columns(new_data_df, col='Ats', tag='Ats',
resolution=ATS_RESOLUTION)
df['Gender_Male'] = float(any(df['Gender'] == 1))
df['Gender_Female'] = float(any(df['Gender'] == 0))
df = df.drop(['Gender'], axis=1)
cols_ats = [str(i)+'Ats' for i in range(1, ATS_RESOLUTION+1)]
header_list = ['Gender', 'BirthYear', 'Cluster',
'LoanPeriod', 'NumberAts'] + cols_ats
header_list = ['Gender_Male', 'Gender_Female', 'BirthYear',
'Cluster', 'LoanPeriod', 'NumberAts'] + cols_ats
df = df.reindex(columns=header_list)
df = df.fillna('0')
......@@ -251,7 +250,7 @@ def get_shap_values(model, X_train=None, X_test=None) -> List:
return shap_values_dict
def get_cluster(ats):
    """Return the cluster id predicted for the given Ats features.

    Loads the stored clustering model ('kmodes_model.joblib') and
    returns its first prediction as an int.
    """
    # Diff residue left a stale duplicate load of 'km.joblib' here; only
    # the post-commit model file is loaded.
    model = read_joblib('kmodes_model.joblib')
    prediction = model.predict(ats)
    return int(prediction[0])
......
......@@ -18,34 +18,30 @@ def main(dataset_version : str = 'emb'):
if dataset_version == 'ohe':
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'complete_count.csv')
else:
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'complete_emb.csv')
model_dir = pt.COMPLETE_XGB_DIR
df = file_reader.read_csv(pt.TESTS_FILES_DIR, 'complete_emb.csv')
target_name = "Complete"
elif case == "Compliance":
if dataset_version == 'ohe':
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'compliance_count.csv')
else:
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'compliance_emb.csv')
model_dir = pt.COMPLIANCE_XGB_DIR
df = file_reader.read_csv(pt.TESTS_FILES_DIR, 'compliance_emb.csv')
target_name = "Compliance"
elif case == "Fall":
if dataset_version == 'ohe':
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'fall_count.csv')
else:
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'fall_emb.csv')
model_dir = pt.FALL_XGB_DIR
df = file_reader.read_csv(pt.TESTS_FILES_DIR, 'fall_emb.csv')
target_name = "Fall"
else:
if dataset_version == 'ohe':
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'risk_count.csv')
else:
df = file_reader.read_csv(pt.PROCESSED_DATA_DIR, 'risk_emb.csv')
model_dir = pt.RISK_XGB_DIR
target_name = "Risk"
df = df.sample(frac=1, random_state=0).reset_index(drop=True)
X = df.drop([target_name], axis=1)
X = df.drop([target_name, 'Rand'], axis=1)
y = df[target_name]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
stratify=y, random_state=0)
......@@ -94,35 +90,13 @@ def main(dataset_version : str = 'emb'):
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:,1]
file_writer.write_cm_plot(y_test, y_pred, pt.REPORTS_PLOTS_DIR,
f'{case.lower()}_xgb_cm.pdf', case)
file_writer.write_joblib(model, model_dir, f'{case.lower()}_xgboost.joblib')
file_writer.write_joblib(model, pt.MODELS_DIR, f'{case.lower()}_xgboost.joblib')
print(f"Scores for {case} XGBoost model:")
print(f"Accuracy: {np.around(accuracy_score(y_test, y_pred), decimals=3)}")
print(f"Precision: {np.around(precision_score(y_test, y_pred), decimals=3)}")
print(f"Recall: {np.around(recall_score(y_test, y_pred), decimals=3)}")
print(f"ROC AUC: {np.around(roc_auc_score(y_test, y_proba), decimals=3)}\n")
feature_names = X.columns
shap_explainer, shap_values = explainer.get_shap_tree_explainer(model, X_test=X_test)
importance_df = pd.DataFrame()
importance_df['feature'] = feature_names
importance_df['shap_values'] = np.around(abs(np.array(shap_values)[:,:]).mean(0), decimals=3)
importance_df['feat_imp'] = np.around(model.feature_importances_, decimals=3)
feat_importance_df_shap = importance_df.groupby('feature').mean().sort_values('shap_values',
ascending=False)
feat_importance_df_shap = feat_importance_df_shap.reset_index()
file_writer.write_csv(feat_importance_df_shap, pt.REPORTS_DIR, f"{case} best features.csv")
file_name_sum = f'{case.lower()}_shap_summary'
file_name_exp = f'{case.lower()}_shap_row_0'
explainer.make_shap_summary_plot(shap_values, X_test, feature_names,
file_name_sum, model_dir,
plot_type=None)
explainer.make_shap_summary_plot(shap_values, X_test, feature_names,
file_name_sum + '_bar', model_dir,
plot_type="bar")
if __name__ == '__main__':
main()
main()
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment