client.py 3.82 KB
Newer Older
Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
1
2
3
4
5
6
"""
client.py
====================================
A command line application that can create various datasets.
"""

7
import argparse
8
import paths as pt
9
10
11
from src.data import parse_and_clean_data, make_screenings
from src.data import make_clusters, make_dataset_full
from src.data import make_dataset_count, make_dataset_emb
12
from src.model import make_xgb_models
13

14
15
16
17
18
19
20
def str2bool(v):
    """Convert a command-line token into a boolean.

    Meant to be used as an ``argparse`` ``type=`` callable so that flags can
    be written as ``--flag true`` / ``--flag no`` and so on.

    Args:
        v: Either a ``bool`` (returned unchanged) or a string. Strings are
            compared case-insensitively against the accepted spellings.

    Returns:
        ``True`` for 'yes', 'true', 't', 'y', '1'; ``False`` for
        'no', 'false', 'f', 'n', '0'.

    Raises:
        argparse.ArgumentTypeError: If ``v`` is not a bool and is not one of
            the accepted spellings (including non-string values).
    """
    if isinstance(v, bool):
        return v
    # Anything that is neither bool nor str cannot be a valid spelling; report
    # it the way argparse expects instead of failing on ``.lower()``.
    if not isinstance(v, str):
        raise argparse.ArgumentTypeError('Boolean value expected.')

    token = v.lower()
    if token in ('yes', 'true', 't', 'y', '1'):
        return True
    if token in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
23
24
25

def parse_arguments(argv=None):
    """Build the command-line parser for the client and parse the arguments.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) ``argparse`` reads the process arguments, which is
            the behaviour callers passing nothing have always had.

    Returns:
        The ``argparse.Namespace`` with ``dataset_year``, ``dataset_version``,
        ``enable_visualization``, ``use_real_ats_names`` and
        ``run_full_pipeline`` attributes.
    """
    parser = argparse.ArgumentParser(description='A client for AIR')
    parser.add_argument('--dataset-year', type=str, default="2020",
                        choices=['2019', '2020'],
                        help='string indicating dataset year')
    parser.add_argument('--dataset-version', type=str, default="emb",
                        choices=['emb', 'ohe'],
                        help='string indicating dataset version')
    # The boolean options use nargs='?' with const=True, so a bare flag
    # (no value following it) is read as True.
    parser.add_argument("--enable-visualization", type=str2bool, nargs='?',
                        const=True, default=False,
                        help="boolean indicating if visualization should be enabled.")
    parser.add_argument("--use-real-ats-names", type=str2bool, nargs='?',
                        const=True, default=False,
                        help="boolean indicating if we should use real ats names.")
    parser.add_argument("--run-full-pipeline", type=str2bool, nargs='?',
                        const=True, default=True,
                        help="boolean indicating if we should run full pipeline. " +
                        "set to false to only make models")
    return parser.parse_args(argv)

def main():
    """Run the client: optionally generate the datasets, then make the models.

    The configuration is read from the command line and echoed to stdout.
    When ``--run-full-pipeline`` is true the data steps run in order
    (parse/clean, screenings, clusters, full dataset, then either the
    embedded or the one-hot encoded dataset). The XGBoost model step runs
    in every case.
    """
    parsed_args = parse_arguments()
    dataset_year = parsed_args.dataset_year
    dataset_version = parsed_args.dataset_version
    enable_visualization = parsed_args.enable_visualization
    use_real_ats_names = parsed_args.use_real_ats_names
    run_full_pipeline = parsed_args.run_full_pipeline

    print("Client started. Using this configuration:")
    # NOTE(review): the 2020 raw-data path is reported regardless of the
    # selected --dataset-year; confirm whether that is intended.
    print(f"Raw data dictionary: {pt.RAW_DATA_DIR_2020}")
    print(f"Dataset year: {dataset_year}")
    print(f"Dataset version: {dataset_version}")
    print(f"Visualization enabled: {enable_visualization}")
    print(f"Use real ATS names: {use_real_ats_names}")
    print(f"Run full pipeline: {run_full_pipeline}\n")

    if run_full_pipeline:
        print("Now parsing and cleaning data ...")
        # Only the 2019 run passes the year explicitly; otherwise the
        # parser module is called with its own defaults.
        if dataset_year == '2019':
            parse_and_clean_data.main(year=dataset_year)
        else:
            parse_and_clean_data.main()

        print("Extracting screenings ...")
        make_screenings.main()

        print("Making clusters ...")
        make_clusters.main()
        print(f"Completed making cluster model. It can be found at: {pt.CLUSTERS_DIR}\n")

        print("Making full dataset ...")
        make_dataset_full.main(use_real_ats_names)

        if dataset_version == "emb":
            print("Making dataset with embedded ats ...")
            make_dataset_emb.main(enable_visualization)
        else:
            print("Making dataset with one-hot encoded ats ...")
            make_dataset_count.main()

        print("\nCompleted generating datasets at:")
        print(f"Interim data dictionary: {pt.INTERIM_DATA_DIR}")
        print(f"Processed data dictionary: {pt.PROCESSED_DATA_DIR}\n")

    # Model creation is not gated by run_full_pipeline.
    print(f"Making 4 XGBoost models based on version: {dataset_version} ...\n")
    make_xgb_models.main(dataset_version)

    print("Completed making models. Models and SHAP plots can be found at:\n" +
          f"{pt.COMPLETE_XGB_DIR}\n" + f"{pt.COMPLIANCE_XGB_DIR}\n" +
          f"{pt.FALL_XGB_DIR}\n" + f"{pt.RISK_XGB_DIR}" + "\n")
91
92
93
    
# Run the client only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()