Commit 03b5c736 authored by Christian Marius Lillelund

updated docs, paths

parent f41eccb3
"""
client.py
====================================
-A command line application that can create various datasets.
+A command line application that can create various datasets for AIR.
"""
import argparse
import paths as pt
-from src.data import parse_and_clean_data, make_screenings
-from src.data import make_clusters, make_dataset_full
-from src.data import make_dataset_count, make_dataset_emb
-from src.model import make_xgb_models
+from src.data import (load_and_clean_data,
+                      make_screening_data,
+                      make_alarm_data,
+                      make_dataset_full,
+                      make_dataset_count,
+                      make_dataset_emb,
+                      make_dataset_ohe,
+                      make_dataset_ordinal)
def str2bool(v):
    if isinstance(v, bool):
@@ -22,72 +26,48 @@ def str2bool(v):
        raise argparse.ArgumentTypeError('Boolean value expected.')
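The fold above hides the middle of str2bool. A typical body consistent with the visible first and last lines looks like this (a sketch; the folded lines may differ):

import argparse

def str2bool(v):
    # Accept real booleans as well as common textual spellings.
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')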
def parse_arguments():
-    parser = argparse.ArgumentParser(description='A client for AIR')
-    parser.add_argument('--dataset-year', type=str, default="2020",
-                        choices=['2019', '2020'], help='string indicating dataset year')
+    parser = argparse.ArgumentParser(description='A command line application that' +
+                                     ' can create various datasets for AIR.')
    parser.add_argument('--dataset-version', type=str, default="emb",
-                        choices=['emb', 'ohe'], help='string indicating dataset version')
-    parser.add_argument("--enable-visualization", type=str2bool, nargs='?',
-                        const=True, default=False,
-                        help="boolean indicating if visualization should be enabled.")
-    parser.add_argument("--use-real-ats-names", type=str2bool, nargs='?',
-                        const=True, default=False,
-                        help="boolean indicating if we should use real ats names.")
-    parser.add_argument("--run-full-pipeline", type=str2bool, nargs='?',
-                        const=True, default=True,
-                        help="boolean indicating if we should run full pipeline. " +
-                        "set to false to only make models")
+                        choices=['emb', 'ohe', 'count', 'ordinal'],
+                        help='string indicating dataset version')
    return parser.parse_args()
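After this commit the client takes a single option. A quick smoke test of the revised parser (a hypothetical session; normally the module is invoked as python client.py --dataset-version <emb|ohe|count|ordinal>):

import sys

# Simulate `python client.py --dataset-version count`.
sys.argv = ['client.py', '--dataset-version', 'count']
args = parse_arguments()
print(args.dataset_version)   # -> count
# An unknown value is rejected by argparse's `choices` check, e.g.
# `--dataset-version tfidf` exits with "invalid choice: 'tfidf'".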
def main():
    parsed_args = parse_arguments()
-    dataset_year = parsed_args.dataset_year
    dataset_version = parsed_args.dataset_version
-    enable_visualization = parsed_args.enable_visualization
-    use_real_ats_names = parsed_args.use_real_ats_names
-    run_full_pipeline = parsed_args.run_full_pipeline
    print("Client started. Using this configuration:")
-    print(f"Raw data directory: {pt.RAW_DATA_DIR_2020}")
-    print(f"Dataset year: {dataset_year}")
+    print(f"Raw data directory: {pt.RAW_DATA_DIR_2021}")
    print(f"Dataset version: {dataset_version}")
-    print(f"Visualization enabled: {enable_visualization}")
-    print(f"Use real ATS names: {use_real_ats_names}")
-    print(f"Run full pipeline: {run_full_pipeline}\n")
-    if run_full_pipeline:
-        print("Now parsing and cleaning data ...")
-        if dataset_year == '2019':
-            parse_and_clean_data.main(year=dataset_year)
-        else:
-            parse_and_clean_data.main()
-        print("Extracting screenings ...")
-        make_screenings.main()
-        print("Making clusters ...")
-        make_clusters.main()
-        print(f"Completed making cluster model. It can be found at: {pt.CLUSTERS_DIR}\n")
-        print("Making full dataset ...")
-        make_dataset_full.main(use_real_ats_names)
-        if dataset_version == "emb":
-            print("Making dataset with embedded ats ...")
-            make_dataset_emb.main(enable_visualization)
-        else:
-            print("Making dataset with one-hot encoded ats ...")
-            make_dataset_count.main()
-        print("\nCompleted generating datasets at:")
-        print(f"Interim data directory: {pt.INTERIM_DATA_DIR}")
-        print(f"Processed data directory: {pt.PROCESSED_DATA_DIR}\n")
+    print("Now parsing and cleaning data ...")
+    load_and_clean_data.main()
-    print(f"Making 4 XGBoost models based on version: {dataset_version} ...\n")
-    make_xgb_models.main(dataset_version)
+    print("Making screenings ...")
+    make_screening_data.main()
+    print("Making full dataset ...")
+    make_dataset_full.main()
-    print(f"Completed making models. Models and SHAP plots can be found at:\n" +
-          f"{pt.COMPLETE_XGB_DIR}\n" + f"{pt.COMPLIANCE_XGB_DIR}\n" +
-          f"{pt.FALL_XGB_DIR}\n" + f"{pt.RISK_XGB_DIR}" + "\n")
+    print("Making alarm dataset ...")
+    make_alarm_data.main()
+    if dataset_version == "emb":
+        print("Making dataset with embedded ats ...")
+        make_dataset_emb.main()
+    elif dataset_version == "ohe":
+        print("Making dataset with one-hot-encoded ats ...")
+        make_dataset_ohe.main()
+    elif dataset_version == "count":
+        print("Making dataset with ats columns as counts ...")
+        make_dataset_count.main()
+    else:
+        print("Making dataset with ats columns as ordinal values ...")
+        make_dataset_ordinal.main()
+    print("\nCompleted generating datasets at:")
+    print(f"Interim data directory: {pt.INTERIM_DATA_DIR}")
+    print(f"Processed data directory: {pt.PROCESSED_DATA_DIR}\n")
if __name__ == "__main__":
    main()
\ No newline at end of file
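The version branching above is a plain if/elif chain; an equivalent dispatch table keeps the mapping in one place if more versions are added later. A sketch using the module names imported in this commit (not the committed code):

# Sketch only: dict-based dispatch over --dataset-version.
makers = {
    'emb': make_dataset_emb.main,
    'ohe': make_dataset_ohe.main,
    'count': make_dataset_count.main,
    'ordinal': make_dataset_ordinal.main,
}
makers[dataset_version]()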
@@ -7,7 +7,6 @@ pytest-runner==5.3.0
scikit-learn==0.24.1
Sphinx==3.5.2
notebook==6.2.0
-streamlit==0.78.0
tensorflow==2.5.0
openpyxl==3.0.6
xgboost==1.3.3
@@ -5,7 +5,7 @@ Script to evaluate the performance of using class weights.
"""
from tools import data_loader
-from utility.settings import load_settings
+from utility.config import load_config
import csv
import paths as pt
from pathlib import Path
@@ -32,7 +32,7 @@ def main():
writer = csv.writer(f)
writer.writerow(header)
-settings = load_settings(pt.CONFIGS_DIR, f"{case.lower()}.yaml")
+settings = load_config(pt.CONFIGS_DIR, f"{case.lower()}.yaml")
if case == "Complete":
    dl = data_loader.CompleteDataLoader(pt.PROCESSED_DATA_DIR,
                                        "complete_emb.csv", settings)
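The recurring change in the remaining files is the rename of utility.settings.load_settings to utility.config.load_config. The function body is not part of this diff; judging by the (directory, '<case>.yaml') call sites, it is presumably a small YAML loader along these lines (an assumption, not the committed code):

# Hypothetical shape of utility/config.py; assumes PyYAML.
from pathlib import Path
import yaml

def load_config(config_dir, file_name):
    # Read a YAML config such as configs/complete.yaml into a dict.
    with open(Path(config_dir, file_name), 'r') as f:
        return yaml.safe_load(f)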
@@ -13,7 +13,7 @@ from tools.classifiers import KnnClassifier, SvmClassifier, LrClassifier
from tools.classifiers import XgbClassifier, RfClassifier, MlpClassifier
from pathlib import Path
import csv
-from utility.settings import load_settings
+from utility.config import load_config
from utility.metrics import compute_mean, compute_std
from io import BytesIO
import shutil
@@ -107,7 +107,7 @@ def main():
metrics = ['accuracy', 'precision', 'recall', 'roc_auc', 'average_precision', 'f1']
cases = ["Compliance"]
for case in cases:
-    target_settings = load_settings(pt.CONFIGS_DIR, f'{case.lower()}.yaml')
+    target_settings = load_config(pt.CONFIGS_DIR, f'{case.lower()}.yaml')
    output_filename = f"{case} model baseline.csv"
    header = ['clf', 'version', 'accuracy_mean', 'accuracy_std',
              'precision_mean', 'precision_std', 'recall_mean',
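The metric names in this hunk are valid sklearn scoring strings, so the mean/std columns in the header can be filled directly from cross_validate. A self-contained illustration with synthetic data (the script's own CV loop sits outside this hunk):

# Sketch: collect a mean per metric, as the CSV header above suggests.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

metrics = ['accuracy', 'precision', 'recall', 'roc_auc', 'average_precision', 'f1']
X, y = make_classification(n_samples=200, random_state=0)
scores = cross_validate(LogisticRegression(max_iter=1000), X, y, cv=5, scoring=metrics)
print([f"{scores[f'test_{m}'].mean():.3f}" for m in metrics])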
@@ -18,10 +18,10 @@ from tools import data_loader
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
import matplotlib.pyplot as plt
-from utility.settings import load_settings
+from utility.config import load_config
def main():
-    settings = load_settings(pt.CONFIGS_DIR, "fall.yaml")
+    settings = load_config(pt.CONFIGS_DIR, "fall.yaml")
    protected_col_name = "Gender"
    y_col_name="Fall"
@@ -15,7 +15,7 @@ from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import RobustScaler, MaxAbsScaler, QuantileTransformer
from tools.classifiers import KnnClassifier, SvmClassifier, LrClassifier
from tools.classifiers import XgbClassifier, RfClassifier, MlpClassifier
-from utility.settings import load_settings
+from utility.config import load_config
from utility.metrics import compute_mean, compute_std
import csv
import paths as pt
@@ -88,22 +88,22 @@ def main():
for normalizer_name, normalizer in zip(normalizer_names, normalizers):
    for scaler_name, scaler in zip(scaler_names, scalers):
        if case == "Complete":
-            settings = load_settings(pt.CONFIGS_DIR, "complete.yaml")
+            settings = load_config(pt.CONFIGS_DIR, "complete.yaml")
            dl = data_loader.CompleteDataLoader(pt.PROCESSED_DATA_DIR,
                                                "complete_emb.csv", settings).load_data()
            X, y = dl.get_data()
        elif case == "Compliance":
-            settings = load_settings(pt.CONFIGS_DIR, "compliance.yaml")
+            settings = load_config(pt.CONFIGS_DIR, "compliance.yaml")
            dl = data_loader.ComplianceDataLoader(pt.PROCESSED_DATA_DIR,
                                                  "compliance_emb.csv", settings).load_data()
            X, y = dl.get_data()
        elif case == "Fall":
-            settings = load_settings(pt.CONFIGS_DIR, "fall.yaml")
+            settings = load_config(pt.CONFIGS_DIR, "fall.yaml")
            dl = data_loader.FallDataLoader(pt.PROCESSED_DATA_DIR,
                                            "fall_emb.csv", settings).load_data()
            X, y = dl.get_data()
        else:
-            settings = load_settings(pt.CONFIGS_DIR, "risk.yaml")
+            settings = load_config(pt.CONFIGS_DIR, "risk.yaml")
            dl = data_loader.RiskDataLoader(pt.PROCESSED_DATA_DIR,
                                            "risk_emb.csv", settings).load_data()
            X, y = dl.get_data()
@@ -6,7 +6,7 @@ Script to make a baseline evaluation of the survival case using CV.
import paths as pt
from tools import data_loader
-from utility.settings import load_settings
+from utility.config import load_config
from utility.data import write_csv
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from sklearn.model_selection import KFold
@@ -17,7 +17,7 @@ import pandas as pd
import numpy as np
def main():
-    target_settings = load_settings(pt.CONFIGS_DIR, "alarm.yaml")
+    target_settings = load_config(pt.CONFIGS_DIR, "alarm.yaml")
    dl = data_loader.AlarmDataLoader(pt.PROCESSED_DATA_DIR,
                                     "alarm_emb.csv",
                                     target_settings).load_data()
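For context on the survival baseline this hunk touches: sksurv's GradientBoostingSurvivalAnalysis expects a structured target (event indicator plus time), and the script evaluates with KFold CV. A minimal self-contained sketch, with synthetic data standing in for the alarm_emb.csv features:

# Sketch of the GradientBoostingSurvivalAnalysis + KFold pattern; data is synthetic.
import numpy as np
from sklearn.model_selection import KFold
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from sksurv.util import Surv

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))
y = Surv.from_arrays(event=rng.integers(0, 2, 100).astype(bool),
                     time=rng.uniform(1.0, 365.0, 100))
for train, test in KFold(n_splits=5, shuffle=True, random_state=0).split(X):
    model = GradientBoostingSurvivalAnalysis(random_state=0).fit(X[train], y[train])
    print(round(model.score(X[test], y[test]), 3))   # concordance index per fold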
@@ -6,7 +6,7 @@ Script to find the best ATS resolution for Complete case.
import paths as pt
from pathlib import Path
-from utility.settings import load_settings
+from utility.config import load_config
from tools import labeler, preprocessor
from utility.data import read_csv
from utility.embedder import encode_dataframe
@@ -28,10 +28,10 @@ def main():
screenings = read_csv(file_path, file_name, converters=converters)
df = screenings.copy()
-data_settings = load_settings(pt.CONFIGS_DIR, 'data.yaml')
+data_settings = load_config(pt.CONFIGS_DIR, 'data.yaml')
accum_screenings = labeler.accumulate_screenings(df, data_settings)
-target_settings = load_settings(pt.CONFIGS_DIR, 'complete.yaml')
+target_settings = load_config(pt.CONFIGS_DIR, 'complete.yaml')
features = target_settings['features']
for ats_res in range(idx, idx+step_size):
@@ -14,7 +14,7 @@ import pandas as pd
from utility.metrics import gini_xgb
import shap
from typing import List
-from utility.settings import load_settings
+from utility.config import load_config
from io import BytesIO, StringIO
import shutil
from pathlib import Path
@@ -24,7 +24,7 @@ NUM_ITERATIONS = 5
def main():
    cases = ["Complete", "Compliance", "Fall", "Risk"]
    for case in cases:
-        settings = load_settings(pt.CONFIGS_DIR, f"{case.lower()}.yaml")
+        settings = load_config(pt.CONFIGS_DIR, f"{case.lower()}.yaml")
        if case == "Complete":
            dl = data_loader.CompleteDataLoader(pt.PROCESSED_DATA_DIR,
                                                "complete_emb.csv", settings)
@@ -5,7 +5,7 @@ Script to make dataset for Alarm case.
"""
from tools import preprocessor
-from utility.settings import load_settings
+from utility.config import load_config
from utility.data import read_pickle, write_csv
import pandas as pd
import numpy as np
@@ -16,7 +16,7 @@ def main():
file_name = 'ats.pkl'
df = read_pickle(file_path, file_name)
-settings = load_settings(pt.CONFIGS_DIR, "data.yaml")
+settings = load_config(pt.CONFIGS_DIR, "data.yaml")
ats_iso_length = settings['ats_iso_length']
df['DevISOClass'] = df['DevISOClass'].apply(lambda x: x[:ats_iso_length])  # limit ats iso length
df = df[['CitizenId', 'BirthYear', 'Gender', 'LendDate', 'ReturnDate', 'DevISOClass']]
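The lambda above coarsens the device codes: DevISOClass holds hierarchical ISO 9999 assistive-technology codes, so keeping only the first ats_iso_length characters groups devices at a higher level of the hierarchy. A tiny illustration (the code values are made up; ats_iso_length really comes from data.yaml):

# Illustration of the DevISOClass truncation; not the committed code.
import pandas as pd

ats_iso_length = 6
dev_iso = pd.Series(['120606-01', '122203-07', '123456-99'])
print(dev_iso.apply(lambda x: x[:ats_iso_length]).tolist())
# -> ['120606', '122203', '123456']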
@@ -9,18 +9,18 @@ values as a count of the number of times a value appears in a column.
import paths as pt
from tools import preprocessor
from utility import embedder
-from utility.settings import load_settings
+from utility.config import load_config
from utility.data import read_csv, write_csv
import pandas as pd
import numpy as np
def main():
    for label_name in ["Complete", "Compliance", "Fall", "Risk", "Alarm"]:
-        data_settings = load_settings(pt.CONFIGS_DIR, 'data.yaml')
+        data_settings = load_config(pt.CONFIGS_DIR, 'data.yaml')
        ats_resolution = data_settings['ats_resolution']
        if label_name == "Risk":
-            target_settings = load_settings(pt.CONFIGS_DIR, f'{label_name.lower()}.yaml')
+            target_settings = load_config(pt.CONFIGS_DIR, f'{label_name.lower()}.yaml')
            ex_resolution = target_settings['ex_resolution']
        if label_name in ["Complete", "Compliance", "Fall", "Alarm"]:
@@ -9,7 +9,7 @@ for the Alarm case.
"""
from utility.embedder import encode_dataframe
-from utility.settings import load_settings
+from utility.config import load_config
from utility.data import read_csv, write_csv
from pathlib import Path
import pandas as pd
@@ -21,8 +21,8 @@ ENABLE_EMB_VIZ = False
def main():
    for label_name in ["Complete", "Compliance", "Fall", "Risk", "Alarm"]:
-        data_settings = load_settings(pt.CONFIGS_DIR, 'data.yaml')
-        target_settings = load_settings(pt.CONFIGS_DIR, f'{label_name.lower()}.yaml')
+        data_settings = load_config(pt.CONFIGS_DIR, 'data.yaml')
+        target_settings = load_config(pt.CONFIGS_DIR, f'{label_name.lower()}.yaml')
        ats_resolution = data_settings['ats_resolution']
        if label_name == "Risk":
@@ -7,7 +7,7 @@ case based on the screenings.
import paths as pt
from tools import labeler, preprocessor
-from utility.settings import load_settings
+from utility.config import load_config
from utility.data import read_csv, write_csv
import pandas as pd
@@ -17,14 +17,14 @@ def main():
file_name = 'screenings.csv'
screenings = read_csv(file_path, file_name, converters=converters)
-data_settings = load_settings(pt.CONFIGS_DIR, 'data.yaml')
+data_settings = load_config(pt.CONFIGS_DIR, 'data.yaml')
ats_resolution = data_settings['ats_resolution']
df = screenings.copy()
accum_screenings = labeler.accumulate_screenings(df, data_settings)
for label_name in ['Complete', 'Compliance', 'Fall', 'Risk']:
-    target_settings = load_settings(pt.CONFIGS_DIR, f'{label_name.lower()}.yaml')
+    target_settings = load_config(pt.CONFIGS_DIR, f'{label_name.lower()}.yaml')
    features = target_settings['features']
    if label_name == "Risk":
@@ -8,16 +8,16 @@ case using one hot encoding of categorical features.
import paths as pt
from tools import preprocessor
import pandas as pd
-from utility.settings import load_settings
+from utility.config import load_config
from utility.data import read_csv, write_csv
def main():
    for label_name in ["Complete", "Compliance", "Fall", "Risk"]:
-        data_settings = load_settings(pt.CONFIGS_DIR, 'data.yaml')
+        data_settings = load_config(pt.CONFIGS_DIR, 'data.yaml')
        ats_resolution = data_settings['ats_resolution']
        if label_name == "Risk":
-            target_settings = load_settings(pt.CONFIGS_DIR, f'{label_name.lower()}.yaml')
+            target_settings = load_config(pt.CONFIGS_DIR, f'{label_name.lower()}.yaml')
            ex_resolution = target_settings['ex_resolution']
        if label_name in ["Complete", "Compliance", "Fall"]:
@@ -9,17 +9,17 @@ import paths as pt
import pandas as pd
import numpy as np
from pathlib import Path
-from utility.settings import load_settings
+from utility.config import load_config
from utility.data import read_csv, write_csv
from sklearn.preprocessing import OrdinalEncoder
def main():
    for label_name in ["Complete", "Compliance", "Fall", "Risk"]:
-        data_settings = load_settings(pt.CONFIGS_DIR, 'data.yaml')
+        data_settings = load_config(pt.CONFIGS_DIR, 'data.yaml')
        ats_resolution = data_settings['ats_resolution']
        if label_name == "Risk":
-            target_settings = load_settings(pt.CONFIGS_DIR, f'{label_name.lower()}.yaml')
+            target_settings = load_config(pt.CONFIGS_DIR, f'{label_name.lower()}.yaml')
            ex_resolution = target_settings['ex_resolution']
        if label_name in ["Complete", "Compliance", "Fall"]:
@@ -8,7 +8,7 @@ import numpy as np
import pandas as pd
import paths as pt
from tools import inputter
-from utility.settings import load_settings
+from utility.config import load_config
from utility.data import read_pickle, write_csv, Data
from pandas.tseries.offsets import DateOffset
from typing import List, Tuple
@@ -20,7 +20,7 @@ def main():
tc = read_pickle(pt.INTERIM_DATA_DIR, 'tc.pkl')
ats = read_pickle(pt.INTERIM_DATA_DIR, 'ats.pkl')
-settings = load_settings(pt.CONFIGS_DIR, "data.yaml")
+settings = load_config(pt.CONFIGS_DIR, "data.yaml")
data = Data(sc, ss, td, tc, ats)
screenings = get_screenings(data, settings)
FROM python:3.8
WORKDIR /app
ADD . ./
RUN pip install --upgrade pip && pip install --no-cache-dir -r requirements.txt \
&& rm -rf requirements.txt
EXPOSE 80
RUN mkdir ~/.streamlit
RUN cp config.toml ~/.streamlit/config.toml
RUN cp credentials.toml ~/.streamlit/credentials.toml
WORKDIR /app
CMD ["streamlit", "run", "./who_completes.py"]
\ No newline at end of file
# Below are all the sections and options you can have in ~/.streamlit/config.toml.
[global]
# By default, Streamlit checks if the Python watchdog module is available and, if not, prints a warning asking you to install it. The watchdog module is not required, but highly recommended. It improves Streamlit's ability to detect changes to files in your filesystem.
# If you'd like to turn off this warning, set this to True.
# Default: false
disableWatchdogWarning = false
# Configure the ability to share apps to the cloud.
# Should be set to one of these values: - "off" : turn off sharing. - "s3" : share to S3, based on the settings under the [s3] section of this config file.
# Default: "off"
sharingMode = "off"
# If True, will show a warning when you run a Streamlit-enabled script via "python my_script.py".
# Default: true
showWarningOnDirectExecution = true
[logger]
# Level of logging: 'error', 'warning', 'info', or 'debug'.
# Default: 'info'
level = "debug"
# String format for logging messages. If logger.datetimeFormat is set, logger messages will default to `%(asctime)s.%(msecs)03d %(message)s`. See [Python's documentation](https://docs.python.org/2.6/library/logging.html#formatter-objects) for available attributes.
# Default: None
messageFormat = "%(asctime)s %(levelname) -7s %(name)s: %(message)s"
[client]
# Whether to enable st.cache.
# Default: true
caching = true
# If false, makes your Streamlit script not draw to a Streamlit app.
# Default: true
displayEnabled = true
[runner]
# Allows you to type a variable or string by itself in a single line of Python code to write it to the app.
# Default: true
magicEnabled = true
# Install a Python tracer to allow you to stop or pause your script at any point and introspect it. As a side-effect, this slows down your script's execution.
# Default: false
installTracer = false
# Sets the MPLBACKEND environment variable to Agg inside Streamlit to prevent Python crashing.
# Default: true
fixMatplotlib = true
[server]
# List of folders that should not be watched for changes. Relative paths will be taken as relative to the current working directory.
# Example: ['/home/user1/env', 'relative/path/to/folder']
# Default: []
folderWatchBlacklist = ['']
# If false, will attempt to open a browser window on start.
# Default: false unless (1) we are on a Linux box where DISPLAY is unset, or (2) server.liveSave is set.
headless = true
# Immediately share the app in such a way that enables live monitoring, and post-run analysis.
# Default: false
liveSave = false
# Automatically rerun script when the file is modified on disk.
# Default: false
runOnSave = false
# The port where the server will listen for client and browser connections.
# Default: 8501
port = 80
# Enables support for Cross-Origin Resource Sharing, for added security.
# Default: true
enableCORS = false
[browser]
# Internet address of the server that the browser should connect to. Can be IP address or DNS name.
# Default: 'localhost'
serverAddress = "0.0.0.0"
# Whether to send usage statistics to Streamlit.
# Default: true
gatherUsageStats = true
# Port that the browser should use to connect to the server when in liveSave mode.
# Default: whatever value is set in server.port.
serverPort = 80
[s3]
# Name of the AWS S3 bucket to save apps.
# Default: (unset)
#bucket =
# URL root for external view of Streamlit apps.
# Default: (unset)
#url =
# Access key to write to the S3 bucket.
# Leave unset if you want to use an AWS profile.
# Default: (unset)
#accessKeyId =
# Secret access key to write to the S3 bucket.
# Leave unset if you want to use an AWS profile.
# Default: (unset)
#secretAccessKey =
# The "subdirectory" within the S3 bucket where to save apps.
# S3 calls paths "keys" which is why the keyPrefix is like a subdirectory. Use "" to mean the root directory.
# Default: ""
keyPrefix = ""
# AWS region where the bucket is located, e.g. "us-west-2".
# Default: (unset)
#region =
# AWS credentials profile to use.
# Leave unset to use your default profile.
# Default: (unset)
#profile =
\ No newline at end of file
[general]
email=""
\ No newline at end of file