#!/usr/bin/env python

"""
Authors: Cecilie Moriat, Tenna Rasmussen, Christian Fischer Pedersen

Date: 20th March, 2020
"""

# Imports
# Standard library
import datetime as dt
from collections import namedtuple
from pathlib import Path

# External
import numpy as np
import pandas as pd

# Internal
import src.config as cfg
import src.utils.utility as ut


def create_features(data, n_weeks_min=None, n_weeks_max=None,
                    single_sample=True, only_first_and_last_sv=False,
                    load_cluster=False, test_mode=False):
    """Extract window features for every citizen and optionally save them.

    Args:
        data: Object exposing the ``sv``, ``ss``, ``td`` and ``tc`` tables
            (used here to collect the citizen ids); it is passed on
            unchanged to ``create_window_features``.
        n_weeks_min: Minimum window length in weeks; ``None`` means 0.
        n_weeks_max: Maximum window length in weeks; ``None`` means no
            upper bound.
        single_sample: Forwarded to ``create_window_features``.
        only_first_and_last_sv: When true the length limits are disabled
            (minimum 0, no maximum) and the flag is forwarded.
        load_cluster: Forwarded to ``create_window_features``.
        test_mode: When true the result is not written to disk.

    Returns:
        A DataFrame with the concatenated features of all citizens (empty
        when no citizen produced a window).
    """
    print("Started extracting features")

    # Set min and max length. ``None`` is kept for the maximum so that the
    # "no upper bound" branch of create_window_features is actually taken;
    # substituting 0 would reject every window of positive length.
    min_length = n_weeks_min if n_weeks_min is not None else 0
    max_length = n_weeks_max
    if only_first_and_last_sv:
        min_length = 0
        max_length = None

    # Get union of ids
    ids = ut.create_union_of_ids(data.sv, data.ss, data.td, data.tc)

    # Number of citizens with less than 2 screenings
    n_citizens_less_two_sv = 0

    # Create window features for ids. Frames are collected and concatenated
    # once, instead of growing a DataFrame inside the loop.
    frames = []
    for citizen_id in ids:
        window_features = create_window_features(
            citizen_id, data, n_citizens_less_two_sv, only_first_and_last_sv,
            min_length, max_length, single_sample, load_cluster)

        # None is returned for citizens with fewer than two screenings.
        if window_features is None:
            n_citizens_less_two_sv += 1
            continue
        if not window_features.empty:
            frames.append(window_features)

    if frames:
        features = pd.concat(frames, axis=0, ignore_index=True)
    else:
        features = pd.DataFrame()

    # Save the processed files and features to disk
    if not test_mode:
        features.to_csv(Path(cfg.PROCESSED_DATA_DIR) / "AIRfeatures.csv",
                        sep=";", index=False, date_format="%d-%m-%Y", na_rep='NA')

    ## Fails here:
    # Generate Descriptive statistics and save to disk
    # features.describe(include="all").to_csv(Path.joinpath(MODEL_DIR, "DescriptionOfAIRFeatures.csv"))
    # Generate Descriptive statistics sorted by development in "Need of Assistence"  and save to disk
    # groupedbyneeds = features.groupby(["Needs"])
    # groupedbyneeds.describe().to_csv(Path.joinpath(MODEL_DIR, "GroupedByDescriptionOfAIRFeatures.csv"))

    print("Finished extracting features")

    return features


def create_window_features(id, data, n_citizens_less_two_sv, only_first_and_last_sv,
                           min_length, max_length, single_sample, load_cluster):
    """Build the feature rows for one citizen's screening windows.

    A window runs from a reference screening (initially the citizen's first
    one) to a later screening. Windows whose length is not positive or falls
    outside ``[min_length, max_length]`` are skipped.

    Args:
        id: Citizen id used to select rows from each table.
        data: Object exposing the ``ptd``, ``sv``, ``ss``, ``td``, ``tc``
            and ``ats`` DataFrames.
        n_citizens_less_two_sv: Kept for backward compatibility only. An
            integer argument cannot be updated for the caller, so it is
            ignored; callers can detect the case through the ``None``
            return value.
        only_first_and_last_sv: Build a single window from the first to the
            last screening and ignore the length limits.
        min_length: Minimum window length in weeks.
        max_length: Maximum window length in weeks, or ``None`` for no
            upper bound.
        single_sample: Stop after the first accepted window.
        load_cluster: Add the ``Cluster`` value from ``data.ptd``.

    Returns:
        ``None`` when the citizen has fewer than two screenings, otherwise
        a DataFrame with one row per accepted window (possibly empty).
    """
    # Make copy of this specific person's values from data tuple
    ptd_id = data.ptd.loc[data.ptd[cfg.CITIZEN_ID] == id]
    sv_id = data.sv.loc[data.sv[cfg.CITIZEN_ID] == id]
    ss_id = data.ss.loc[data.ss[cfg.CITIZEN_ID] == id]
    td_id = data.td.loc[data.td[cfg.CITIZEN_ID] == id]
    tc_id = data.tc.loc[data.tc[cfg.CITIZEN_ID] == id]
    # The assistive-technology table is matched on the string form of the id.
    ats_id = data.ats.loc[data.ats[cfg.CITIZEN_ID] == str(id)]

    # A window needs both a start and an end screening.
    if len(sv_id) < 2:
        return None

    # Features that do not depend on the window are computed once.
    patient_id = sv_id[cfg.PATIENT_ID].iloc[0]
    sex = 0 if (sv_id[cfg.SEX].iloc[0] == 'Kvinde') else 1
    birth_year = int(sv_id[cfg.BIRTH_YEAR].iloc[0]) + 1900

    # Regular screening window procedure
    pre_screening = sv_id.iloc[0]
    # The last screening for this citizen
    screening_last = sv_id.iloc[-1]

    window_frames = []

    # Rows are read with .iloc so that every screening is a Series. A Series
    # supports both attribute access and the ``screening[cfg.X]`` item access
    # used by the helpers; the namedtuples from itertuples() do not.
    # The first screening only serves as the initial reference.
    for i in range(1, len(sv_id)):
        # With only_first_and_last_sv the window ends at the last screening,
        # so its date and its scores are both taken from that screening.
        screening = screening_last if only_first_and_last_sv else sv_id.iloc[i]

        # start and end date of the current screening interval
        start_date = pre_screening.ScreeningDate
        end_date = screening.ScreeningDate

        # length of screening interval
        n_weeks = np.around((end_date - start_date).days / 7, decimals=1)

        # Never accept an empty window: n_weeks is used as a divisor below.
        if n_weeks <= 0:
            if only_first_and_last_sv:
                break
            continue
        if not only_first_and_last_sv:
            # only include intervals within the requested length limits
            if n_weeks < min_length:
                continue
            if max_length is not None and max_length < n_weeks:
                continue

        # create window and assign features that are not time dependent
        window_features = pd.DataFrame([{cfg.CITIZEN_ID: id, cfg.PATIENT_ID: patient_id,
                                         cfg.SEX: sex, cfg.BIRTH_YEAR: birth_year}])
        window_features['Age'] = get_age_at_start(pre_screening, birth_year)

        # Data for this particular window
        tdw = td_id.loc[(td_id[cfg.RATING_DATE] <= end_date) & (td_id[cfg.RATING_DATE] >= start_date)]
        tcw = tc_id.loc[(tc_id[cfg.RATING_DATE] <= end_date) & (tc_id[cfg.RATING_DATE] >= start_date)]
        ssw = ss_id.loc[(ss_id[cfg.CHANGE_DATE] <= end_date) & (ss_id[cfg.CHANGE_DATE] >= start_date)]
        huw = ats_id.loc[(ats_id[cfg.LEND_DATE] <= end_date) & (ats_id[cfg.LEND_DATE] >= start_date)]

        # Set rating date as start date for window if it exists
        start_date = tdw.RatingDate.iloc[0] if not tdw.empty else start_date

        # Calculate and assign evaluations
        n_training = get_n_training_window(tdw)
        window_features['StartMonth'] = pd.Timestamp(start_date).month
        window_features['EndMonth'] = pd.Timestamp(end_date).month
        window_features['nWeeks'] = n_weeks
        window_features['MeanEvaluation'] = get_mean_evaluation(tdw)
        window_features['StdEvaluation'] = get_std_evaluation(tdw)
        window_features['MinEvaluation'] = get_min_evaluation(tdw)
        window_features['MaxEvaluation'] = get_max_evaluation(tdw)
        window_features['nTraining'] = n_training
        window_features['nTrainingOptimal'] = get_n_train_optimal(n_weeks)
        window_features['nTrainingPrWeek'] = get_n_trainings_per_week(n_weeks, n_training)

        # Calculate and assign base training data
        training_pr_week = get_training_per_week(tdw, start_date)
        n_weeks_with_training = get_n_weeks_with_trainings(start_date, tdw)
        window_features['nTrainingPrWeekMax'] = get_n_training_per_week_max(training_pr_week)
        window_features['nTrainingPrWeekMin'] = get_n_training_per_week_min(
            training_pr_week, n_weeks_with_training, n_weeks)
        window_features['nWeeksWithTrainings'] = n_weeks_with_training
        window_features['TimeBetweenTrainingsAvg'] = get_avg_time_between_trainings(tdw)

        # Calculate if citizen has had a successful program by training weeks
        n_weeks_first_12 = get_n_weeks_with_training_first_12(tdw, start_date)
        n_weeks_last_12 = get_n_weeks_with_training_last_12(tdw, start_date, end_date)
        window_features['nWeeksWithTrainingsIn12Weeks'] = n_weeks_first_12
        window_features['SP-START'] = 1 if n_weeks_first_12 >= 8 else 0
        window_features['SP-END'] = 1 if n_weeks_last_12 >= 8 else 0
        window_features['SP-ALL'] = get_successful_program_all(td_id)

        # Calculate and assign training cancellations
        n_cancel = tcw.shape[0]
        cancelsprweek = tcw[cfg.RATING_DATE].apply(lambda x: "%d/%d" % (x.week, x.year))
        window_features['nWeeksWithoutTrainings'] = get_n_weeks_without_training(
            n_weeks, n_weeks_with_training)
        window_features['nCancellations'] = n_cancel
        window_features['TimeBetweenCancelsAvg'] = get_avg_time_between_cancels(tcw, n_cancel)
        window_features['nCancellationsPrWeekAVG'] = round(n_cancel / n_weeks, 2)
        # The max helper is a plain value_counts().max(), so it serves
        # cancellations as well; max and min were previously swapped.
        window_features['nCancellationsPrWeekMax'] = get_n_training_per_week_max(cancelsprweek)
        window_features['nCancellationsPrWeekMin'] = get_n_cancel_per_week_min(cancelsprweek)

        # Calculate and assign assistive aids
        window_features['NumberATsRunning'] = get_number_of_ats_running(ats_id, end_date)
        window_features['NewAts'] = pd.Series([get_new_at(huw)])

        if load_cluster:
            # NaN when the citizen has no row in the ptd table.
            cluster = ptd_id['Cluster'].iloc[0] if not ptd_id.empty else np.nan
            window_features['Cluster'] = cluster

        # Devices on loan at the window start; converted to str so the
        # string matching below also works for non-string ISO classes.
        device_codes = get_devices_count(ats_id, start_date).astype(str)
        devices_unique = pd.Series(device_codes.unique())

        # Get significant devices according to DigiRehab
        has_rollator = bool(devices_unique.str.contains("120606").any())
        has_raised_toilet_Seat = bool(
            devices_unique.str.contains('091212|091215|091218', regex=True).any())
        has_shower_stool = bool(devices_unique.str.contains('093307').any())

        window_features['HasRollator'] = has_rollator
        window_features['HasRaisedToiletSeat'] = has_raised_toilet_Seat
        window_features['HasShowerStool'] = has_shower_stool
        window_features['HasRaisedToiletSeatAndShowerStool'] = (
            has_raised_toilet_Seat and has_shower_stool)

        # Keep the first six characters of each ISO class.
        device_codes = device_codes.str[:6]
        devices_unique = pd.Series(device_codes.unique())
        # The lists are wrapped in a one-element Series so each is stored in
        # a single cell (as for 'NewAts'); assigning a list with more than
        # one element directly to a one-row frame raises ValueError.
        window_features['DevicesCount'] = (
            0 if device_codes.empty else pd.Series([device_codes.tolist()]))
        window_features['DevicesUnique'] = (
            0 if devices_unique.empty else pd.Series([devices_unique.tolist()]))

        # Calculate and assign needs
        window_features['NeedsStart'] = pre_screening.NeedForHelpScore
        # window_features['NeedsStartReason'] = get_needs_reason(pre_screening) TODO: Decide to include
        window_features['NeedsEnd'] = screening.NeedForHelpScore
        window_features['NeedsDifference'] = screening.NeedForHelpScore - pre_screening.NeedForHelpScore
        # window_features['NeedsReason'] = get_needs_reason(screening) TODO: Decide to include
        window_features['Needs'] = get_needs_indicator(
            pre_screening.NeedForHelpScore, screening.NeedForHelpScore)

        # Calculate and assign physics
        window_features['PhysicsStart'] = pre_screening.PhysicalStrengthScore
        # window_features['PhysicsStartReason'] = get_physics_reason(pre_screening) TODO: Decide to include
        window_features['PhysicsEnd'] = screening.PhysicalStrengthScore
        window_features['PhysicsDifference'] = (
            screening.PhysicalStrengthScore - pre_screening.PhysicalStrengthScore)
        # window_features['PhysicsReason'] = get_physics_reason(screening) TODO: Decide to include
        window_features['Physics'] = get_physics_indicator(
            pre_screening.PhysicalStrengthScore, screening.PhysicalStrengthScore)

        # Calculate and assign misc
        window_features['RehabIndicator'] = get_rehab_indicator(
            pre_screening.NeedForHelpScore, pre_screening.PhysicalStrengthScore)
        window_features['Exercises'] = pd.Series([get_exercises(pre_screening)])
        window_features['LastStatusMonth'] = get_last_status_month(ssw, '%Y-%m-%d %H:%M:%S')
        window_features['LastStatus'] = get_last_status(ssw)

        # Add window to total data for id
        window_frames.append(window_features)

        if single_sample or only_first_and_last_sv:
            break

        # Set current screening as reference for next interval
        pre_screening = screening

    if not window_frames:
        return pd.DataFrame()
    return pd.concat(window_frames, axis=0, ignore_index=True)


def get_rehab_indicator(needs_start, physics_start):
    """Return the start need-for-help score divided by the start physics score."""
    ratio = needs_start / physics_start
    return ratio

238
def get_devices_count(hu_id, start_date):
    """Return the ISO class of every device that is on loan at ``start_date``.

    A device is on loan when it was lent on or before ``start_date`` and is
    either returned after ``start_date`` or has no return date.
    """
    lent_by_start = hu_id.LendDate <= start_date
    still_on_loan = (start_date < hu_id.ReturnDate) | hu_id.ReturnDate.isna()
    return hu_id[cfg.DEV_ISO_CLASS][lent_by_start & still_on_loan]


def get_n_weeks_with_training_first_12(tdw, start_date):
    """Count the distinct weeks with a training within 84 days of ``start_date``.

    Each training is mapped to its whole-week index relative to
    ``start_date``; trainings more than 84 days after it are ignored.
    """
    def week_index(rating_date):
        days_since_start = (rating_date - start_date).days
        if days_since_start <= 84:
            return np.floor(days_since_start / 7)
        return np.nan

    return tdw[cfg.RATING_DATE].apply(week_index).dropna().nunique()


def get_n_weeks_with_training_last_12(tdw, start_date, end_date):
    """Count the distinct weeks with a training within 84 days before ``end_date``.

    Week indices are counted from ``start_date``; trainings more than
    84 days before ``end_date`` are ignored.
    """
    def week_index(rating_date):
        if (end_date - rating_date).days <= 84:
            return np.floor((rating_date - start_date).days / 7)
        return np.nan

    return tdw[cfg.RATING_DATE].apply(week_index).dropna().nunique()


def get_successful_program_all(td_id):
    """Return 1 if any 8 distinct training weeks span at most 12 weeks, else 0.

    Week indices are counted from the first row's rating date. Returns 0
    for an empty table or when there are fewer than 8 distinct weeks.
    """
    if td_id.empty:
        return 0

    first_training = td_id.RatingDate.iloc[0]
    # Computed once; used for both the distinct count and the scan below.
    week_indices = td_id[cfg.RATING_DATE].apply(
        lambda x: np.floor((x - first_training).days / 7))
    if week_indices.nunique() < 8:
        return 0

    # Distinct week indices in order of first appearance.
    weeks_of_training = week_indices.unique()
    # Compare every week with the one seven positions further on.
    for i in range(len(weeks_of_training) - 7):
        if (weeks_of_training[i + 7] - weeks_of_training[i]) <= 12:
            return 1
    return 0


def get_needs_reason(screening):
    """Return the screening's need-for-help reason, or NaN when it is blank."""
    reason = screening.NeedForHelpReason
    return reason if is_not_blank(str(reason)) else np.nan

def get_physics_reason(screening):
    """Return the screening's physical-strength reason, or NaN when it is blank."""
    reason = screening.PhysicalStrengthReason
    return reason if is_not_blank(str(reason)) else np.nan

284
def get_last_status(ssw):
    """Return the status of the last row in ``ssw``, or the string 'None' when empty."""
    if ssw.empty:
        return 'None'
    return ssw[cfg.STATUS].iat[-1]


def get_last_status_month(ssw, date_format):
    """Return the month of the last change date in ``ssw``, or 0 when empty.

    The last change date is converted to a string and parsed with
    ``date_format``.
    """
    if ssw.empty:
        return 0
    last_change = str(ssw[cfg.CHANGE_DATE].iat[-1])
    return dt.datetime.strptime(last_change, date_format).date().month


def get_new_at(huw):
    """Return the ISO classes in ``huw`` as a list, or NaN when ``huw`` is empty."""
    if huw.empty:
        return np.nan
    return list(huw[cfg.DEV_ISO_CLASS])


def get_number_of_ats_running(hui, end_date):
    """Return the number of rows in ``hui`` lent on or before ``end_date``.

    NOTE(review): return dates are not taken into account here - confirm
    that already returned devices are meant to be counted.
    """
    lent_by_end = hui[cfg.LEND_DATE] <= end_date
    return hui.loc[lent_by_end].shape[0]


def get_exercises(screening):
    """Return the screening's exercise content split on commas."""
    return screening[cfg.EXERCISE_CONTENT].split(",")


def get_physics_indicator(physics_start, physics_end):
    """Return 1 when the physics score increased (a positive change), else 0."""
    return 1 if physics_end > physics_start else 0


def get_needs_indicator(needs_start, needs_end, dr_threshold = 24):
    """Return 1 when the needs score dropped by more than ``dr_threshold``, else 0.

    A return value of 1 is a positive change.
    """
    return 1 if needs_end < (needs_start - dr_threshold) else 0


def get_n_cancel_per_week_min(cancelsprweek):
    """Return the smallest per-week cancellation count, or 0 for an empty Series.

    ``cancelsprweek`` holds one week label per cancellation.
    """
    if cancelsprweek.empty:
        return 0
    return cancelsprweek.value_counts().min()


def get_n_weeks_without_training(n_weeks, n_weeks_with_trainings):
    """Return the number of weeks without training, never below 0.

    The window length is rounded up to whole weeks before subtracting.
    """
    return max(0, (np.ceil(n_weeks) - n_weeks_with_trainings))


def get_n_weeks_with_trainings(start_date, tdw):
    """Count the distinct whole-week indices, relative to ``start_date``, that contain a training."""
    week_indices = tdw[cfg.RATING_DATE].apply(
        lambda x: np.floor((x - start_date).days / 7))
    return week_indices.nunique()


def get_training_per_week(tdw, start_date):
    """Return, for each training, its whole-week index relative to ``start_date``."""
    return tdw[cfg.RATING_DATE].apply(
        lambda x: np.floor((x - start_date).days / 7))


def get_n_training_per_week_max(training_pr_week):
    """Return the largest number of entries sharing one week value, or 0 when empty.

    ``training_pr_week`` holds one week value per entry.
    """
    if training_pr_week.empty:
        return 0
    return training_pr_week.value_counts().max()


def get_n_training_per_week_min(training_pr_week, n_weeks_with_trainings, n_weeks):
    """Return the smallest number of trainings in one week.

    The minimum is only reported when the number of weeks with trainings
    exceeds ``n_weeks``; otherwise, and for an empty Series, 0 is returned.
    """
    if training_pr_week.empty or not n_weeks_with_trainings > n_weeks:
        return 0
    return training_pr_week.value_counts().min()


def get_n_trainings_per_week(n_weeks, n_training_window):
    """Return the number of trainings per week, rounded to one decimal."""
    return np.around(n_training_window / n_weeks, decimals=1)


def get_n_train_optimal(n_weeks):
    """Return the optimal number of trainings for a window of ``n_weeks`` weeks.

    The optimal number of trainings is 2 per week, rounded up.
    """
    return np.ceil(n_weeks * 2)


def get_n_training_window(tdw):
    """Return the number of rows in ``tdw``, or 0 when it is empty."""
    if tdw.empty:
        return 0
    return tdw.shape[0]


def get_max_evaluation(tdw):
    """Return the highest rating score in ``tdw``, rounded to one decimal."""
    return np.around(tdw[cfg.RATING_SCORE].max(axis=0), decimals=1)


def get_min_evaluation(tdw):
    """Return the lowest rating score in ``tdw``, rounded to one decimal."""
    return np.around(tdw[cfg.RATING_SCORE].min(axis=0), decimals=1)


def get_std_evaluation(tdw):
    """Return the standard deviation of the rating scores in ``tdw``, rounded to one decimal."""
    return np.around(tdw[cfg.RATING_SCORE].std(axis=0), decimals=1)


def get_mean_evaluation(tdw):
    """Return the mean rating score in ``tdw``, rounded to one decimal."""
    return np.around(tdw[cfg.RATING_SCORE].mean(axis=0), decimals=1)


def get_age_at_start(pre_screening, birth_year):
    """Return the screening year of ``pre_screening`` minus ``birth_year``.

    ``pre_screening`` must support item access by column name.
    """
    start_year = pd.to_datetime(pre_screening[cfg.SCREENING_DATE]).year
    return start_year - birth_year


def get_avg_time_between_trainings(tdw, n_decimals=2):
    """Return the mean number of days between consecutive trainings.

    Returns 0 when ``tdw`` is empty. The result is rounded to
    ``n_decimals`` decimals. With a single training there are no gaps, so
    the mean is taken over an empty Series.
    """
    if tdw.empty:
        return 0
    days_between = tdw[cfg.RATING_DATE].diff().apply(lambda x: x.days)
    # The first difference has no predecessor and is skipped.
    return round(days_between.iloc[1:].mean(), n_decimals)


def get_avg_time_between_cancels(tcw, n_cancel, n_decimals=2):
    """Return the mean number of days between consecutive cancellations.

    Returns 0 when ``tcw`` is empty or ``n_cancel`` is not greater than 1.
    The result is rounded to ``n_decimals`` decimals.
    """
    if tcw.empty or not n_cancel > 1:
        return 0
    days_between = tcw[cfg.RATING_DATE].diff().apply(lambda x: x.days)
    # The first difference has no predecessor and is skipped.
    return round(days_between.iloc[1:].mean(), n_decimals)


def is_not_blank(s):
    """Return True when ``s`` is truthy and contains a non-whitespace character."""
    if not s:
        return False
    return bool(s.strip())