parser.py 7.85 KB
Newer Older
1
2
3
4
5
6
7
8
9
#!/usr/bin/env python

"""
Authors: Cecilie Moriat, Tenna Rasmussen, Christian Fischer Pedersen

Date: 20th March, 2020
"""

# Internal
10
import src.data.file_reader as file_reader
Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
11
import src.data.file_writer as file_writer
12
import src.config as cfg
13
14
15
16

# External
import pandas as pd
import numpy as np
Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
17
import datetime
18
from pathlib import Path
19

Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
20
def parse_training_table(file_name, path, parsed_file_name):
    """
    Parse a DigiRehab Training Done / Training Cancelled table.

    The Excel file is loaded with ``file_reader.read_excelfile`` and unpacked
    by ``format_single_line_data`` into one row per (patient id, rating date,
    rating score) entry. Empty strings are replaced by NaN, the patient id is
    cast to ``str``, the rating date is parsed as ``%d/%m/%Y`` and the rating
    score is cast to ``float``.

    :param file_name: name of the Excel file to read
    :param path: location handed to the file reader together with ``file_name``
    :param parsed_file_name: not used yet; intended name for the saved result
    :rtype: pandas.DataFrame
    :return: the parsed training table
    """
    raw_table = file_reader.read_excelfile(path, file_name)
    columns = [cfg.PATIENT_ID, cfg.RATING_DATE, cfg.RATING_SCORE]
    parsed = format_single_line_data(raw_table, columns, "Patient Id")

    # Normalise missing values before the columns are typed
    parsed.replace(to_replace='', value=np.nan, regex=True, inplace=True)
    parsed[cfg.PATIENT_ID] = parsed[cfg.PATIENT_ID].astype(str)
    parsed[cfg.RATING_DATE] = pd.to_datetime(parsed[cfg.RATING_DATE],
                                             format='%d/%m/%Y')
    parsed[cfg.RATING_SCORE] = parsed[cfg.RATING_SCORE].astype('float')

    # TODO: Enable saving the parsed table as parsed_file_name + ".csv"
    #       through file_writer.write_csv
    return parsed
43

Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
44
def parse_status_set(file_name, path, parsed_file_name):
    """
    Parse a DigiRehab Status Set table.

    The Excel file is loaded with ``file_reader.read_excelfile`` and unpacked
    by ``format_single_line_data`` into one row per (patient id, change date,
    status) entry. Empty strings are replaced by NaN, the patient id is cast
    to ``str``, the change date is parsed as ``%d/%m/%Y`` and the status is
    stored as a categorical column.

    :param file_name: name of the Excel file to read
    :param path: location handed to the file reader together with ``file_name``
    :param parsed_file_name: not used yet; intended name for the saved result
    :rtype: pandas.DataFrame
    :return: the parsed status set table
    """
    raw_table = file_reader.read_excelfile(path, file_name)
    columns = [cfg.PATIENT_ID, cfg.CHANGE_DATE, cfg.STATUS]
    status_set = format_single_line_data(raw_table, columns, "Patient Id")

    # Normalise missing values before the columns are typed
    status_set.replace(to_replace='', value=np.nan, regex=True, inplace=True)
    status_set[cfg.PATIENT_ID] = status_set[cfg.PATIENT_ID].astype(str)
    status_set[cfg.CHANGE_DATE] = pd.to_datetime(status_set[cfg.CHANGE_DATE],
                                                 format='%d/%m/%Y')
    status_set[cfg.STATUS] = status_set[cfg.STATUS].astype('category')

    # TODO: Enable saving the parsed table as parsed_file_name + ".csv"
    #       through file_writer.write_csv
    return status_set
Cecilie Østergaard Moriat's avatar
Cecilie Østergaard Moriat committed
68
69


Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
70
def parse_screening_values(file_name, path, parsed_file_name):
    """
    Parse a DigiRehab Screening Values table.

    The Excel file is loaded with ``file_reader.read_excelfile`` and unpacked
    by ``format_multiple_line_data`` into one row per screening. Empty
    strings are replaced by NaN, the patient id is cast to ``str``, the
    screening date is parsed as ``%d-%m-%Y``, the two score columns are cast
    to ``float`` and the reason / exercise columns become categoricals.

    :param file_name: name of the Excel file to read
    :param path: location handed to the file reader together with ``file_name``
    :param parsed_file_name: not used yet; intended name for the saved result
    :rtype: pandas.DataFrame
    :return: the parsed screening values
    """
    raw_table = file_reader.read_excelfile(path, file_name)
    columns = [
        cfg.PATIENT_ID,
        cfg.SCREENING_DATE,
        cfg.NEED_FOR_HELP_SCORE,
        cfg.NEED_FOR_HELP_REASON,
        cfg.PHYSICAL_STRENGTH_SCORE,
        cfg.PHYSICAL_STRENGTH_REASON,
        cfg.EXERCISE_CONTENT,
    ]

    # Unpack the packed screening cell into one row per screening
    screenings = format_multiple_line_data(raw_table, columns, "Patient Id")

    # Normalise missing values before the columns are typed
    screenings.replace(to_replace='', value=np.nan, regex=True, inplace=True)
    screenings[cfg.PATIENT_ID] = screenings[cfg.PATIENT_ID].astype(str)
    screenings[cfg.SCREENING_DATE] = pd.to_datetime(
        screenings[cfg.SCREENING_DATE], format='%d-%m-%Y')

    # Remaining casts, applied in the same order as the column layout
    remaining_casts = (
        (cfg.NEED_FOR_HELP_SCORE, 'float'),
        (cfg.NEED_FOR_HELP_REASON, 'category'),
        (cfg.PHYSICAL_STRENGTH_SCORE, 'float'),
        (cfg.PHYSICAL_STRENGTH_REASON, 'category'),
        (cfg.EXERCISE_CONTENT, 'category'),
    )
    for column, target_dtype in remaining_casts:
        screenings[column] = screenings[column].astype(target_dtype)

    # TODO: Enable saving the parsed table as parsed_file_name + ".csv"
    #       through file_writer.write_csv
    return screenings
105

Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
106
def parse_assistive_aids(file_name, path, parsed_file_name):
    """
    Parse a KMD assistive aids CSV file.

    Every field is read as a string (``converters``), the header found on the
    second line of the file (``header=1``) is replaced by the configured
    column names, and the lend / return date columns are then parsed with the
    ``%Y/%m/%d %H:%M:%S`` format. Values that do not match the format become
    ``NaT`` instead of raising.

    :param file_name: name of the CSV file to read
    :param path: directory containing ``file_name`` (``str`` or ``Path``)
    :param parsed_file_name: not used yet; intended name for the saved result
    :rtype: pandas.DataFrame
    :return: the parsed assistive aids table
    """
    hu_file = Path(path) / file_name

    hu_columns = [cfg.CITIZEN_ID,
                  cfg.DEV_HMI_NUMBER,
                  cfg.DEV_HMI_NAME,
                  cfg.DEV_ISO_CLASS,
                  cfg.DEV_SERIAL,
                  cfg.LAW_PARAGRAPH,
                  cfg.LEND_DATE,
                  cfg.RETURN_DATE,
                  cfg.PRICE]

    df = pd.read_csv(hu_file,
                     header=1,
                     names=hu_columns,
                     converters={i: str for i in range(0, 10000)})

    # The dates are parsed after reading rather than through read_csv's
    # ``date_parser`` argument, which is deprecated since pandas 2.0.
    for date_column in (cfg.LEND_DATE, cfg.RETURN_DATE):
        df[date_column] = pd.to_datetime(df[date_column],
                                         format='%Y/%m/%d %H:%M:%S',
                                         errors='coerce')

    # TODO: Enable saving the parsed table as parsed_file_name + ".csv"
    #       through file_writer.write_csv
    return df
Cecilie Østergaard Moriat's avatar
Cecilie Østergaard Moriat committed
139
140


Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
141
def parse_iso_classes(file_name, path, parsed_file_name):
    """
    Parse a KMD ISO class CSV file.

    The file is read without a header row; only its first three columns are
    kept and they are named after the configured ISO class, group size and
    description columns. Every field is read as a string.

    :param file_name: name of the CSV file to read
    :param path: directory (``Path``) containing ``file_name``
    :param parsed_file_name: not used yet; intended name for the saved result
    :rtype: pandas.DataFrame
    :return: the parsed ISO class table
    """
    isoclass_file = path.joinpath(file_name)
    column_names = [cfg.DEV_ISO_CLASS, cfg.GROUP_SIZE, cfg.DESCRIPTION]
    as_string = {position: str for position in range(10000)}

    iso_classes = pd.read_csv(
        isoclass_file,
        header=None,
        usecols=[0, 1, 2],
        names=column_names,
        converters=as_string,
    )

    # TODO: Enable saving the parsed table as parsed_file_name + ".csv"
    #       through file_writer.write_csv
    return iso_classes
Cecilie Østergaard Moriat's avatar
Cecilie Østergaard Moriat committed
160
161


Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
162
def parse_patient_data(file_name, path, parsed_file_name):
    """
    Parse the DigiRehab BorgerIDALL.csv patient file.

    The first four columns of the semicolon-separated file are read, rows
    without a value in the ``Gender`` column are dropped, and the columns are
    renamed to the configured patient id, citizen id, sex and birth year
    names. The two id columns are cast to ``str``, sex becomes a categorical
    column and birth year is cast to ``float``.

    :param file_name: name of the CSV file to read
    :param path: directory (``Path``) containing ``file_name``
    :param parsed_file_name: not used yet; intended name for the saved result
    :rtype: pandas.DataFrame
    :return: the parsed patient data
    """
    citizen_file = path.joinpath(file_name)
    column_names = [cfg.PATIENT_ID, cfg.CITIZEN_ID, cfg.SEX, cfg.BIRTH_YEAR]

    patients = pd.read_csv(citizen_file, header=0, usecols=[0, 1, 2, 3], sep=';')

    # Rows without a gender are considered incomplete; drop them while the
    # original header names are still in place, then apply the new names.
    patients.dropna(subset=['Gender'], inplace=True)
    patients.columns = column_names

    casts = (
        (cfg.PATIENT_ID, str),
        (cfg.CITIZEN_ID, str),
        (cfg.SEX, 'category'),
        (cfg.BIRTH_YEAR, 'float'),
    )
    for column, target_dtype in casts:
        patients[column] = patients[column].astype(target_dtype)

    # TODO: Enable saving the parsed table as parsed_file_name + ".csv"
    #       through file_writer.write_csv
    return patients
Cecilie Østergaard Moriat's avatar
Cecilie Østergaard Moriat committed
193

Christian Marius Lillelund's avatar
Christian Marius Lillelund committed
194
def parse_clusters(file_name):
    """
    Read a cluster CSV file from the interim data directory.

    The file is looked up under ``cfg.INTERIM_DATA_DIR`` and read into two
    columns: the configured citizen id column (kept as ``str``) and
    ``'Cluster'``.

    :param file_name: name of the CSV file inside the interim data directory
    :rtype: pandas.DataFrame
    :return: a dataframe with the citizen id and cluster columns
    """
    cluster_file = cfg.INTERIM_DATA_DIR.joinpath(file_name)
    column_names = [cfg.CITIZEN_ID, 'Cluster']
    return pd.read_csv(cluster_file,
                       names=column_names,
                       dtype={cfg.CITIZEN_ID: str})
    
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
def format_single_line_data(data, data_columns, index_col, delimiter=';'):
    """
    Unpack delimiter-separated value pairs into one dataframe row per pair.

    The fifth column (position 4) of every row in ``data`` is expected to
    hold a string such as ``"a1;b1;a2;b2"``. The string is split on
    ``delimiter`` and consecutive values are paired up; each pair becomes one
    output row prefixed with the row's ``index_col`` value. A trailing value
    without a partner is ignored. Rows whose packed cell is missing (NaN /
    None) are skipped, matching ``format_multiple_line_data``.

    :param data: dataframe whose fifth column holds the packed values
    :param data_columns: the three column names of the resulting dataframe
    :param index_col: label of the column in ``data`` identifying each row
    :param delimiter: separator between the packed values
    :rtype: pandas.DataFrame
    :return: a dataframe with one row per unpacked pair
    """
    records = []
    for _, row in data.iterrows():
        # Explicit positional access; plain row[4] on a label-indexed Series
        # relies on a deprecated positional fallback.
        packed = row.iloc[4]
        if pd.isna(packed):
            continue
        values = str.split(packed, delimiter)
        for first, second in zip(values[::2], values[1::2]):
            records.append([row[index_col], first, second])
    # Build the frame once instead of concatenating inside the loop
    return pd.DataFrame(records, columns=data_columns)

def format_multiple_line_data(data, data_columns, index_col):
    """
    Unpack ``#``-separated records into one dataframe row per record.

    The fifth column (position 4) of every row in ``data`` is expected to
    hold a string of records separated by ``#``, each record consisting of
    fields separated by ``;``. Every non-empty record becomes one output row
    prefixed with the row's ``index_col`` value; square brackets are removed
    from the last field. Rows whose packed cell is missing (NaN / None) and
    empty records (for example after a trailing ``#``) are skipped.

    :param data: dataframe whose fifth column holds the packed records
    :param data_columns: column names of the resulting dataframe
    :param index_col: label of the column in ``data`` identifying each row
    :raises ValueError: if a record does not yield ``len(data_columns)`` values
    :rtype: pandas.DataFrame
    :return: a dataframe with one row per unpacked record
    """
    expected_width = len(data_columns)
    records = []
    for _, row in data.iterrows():
        # Explicit positional access; plain row[4] on a label-indexed Series
        # relies on a deprecated positional fallback.
        packed = row.iloc[4]
        if pd.isna(packed):
            continue
        for line in str.split(packed, '#'):
            # Drop empty records before splitting: ''.split(';') yields ['']
            # and would never be filtered out afterwards.
            if not line:
                continue
            entry = [row[index_col]] + str.split(line, ';')
            entry[-1] = entry[-1].replace('[', '').replace(']', '')
            if len(entry) != expected_width:
                raise ValueError(
                    f"{expected_width} columns passed, "
                    f"record had {len(entry)} values: {line!r}")
            records.append(entry)
    # DataFrame.append was removed in pandas 2.0; build the frame once
    return pd.DataFrame(records, columns=data_columns)