Skip to content
Snippets Groups Projects
Commit ad16f606 authored by thecml's avatar thecml
Browse files

made alarm script, adjusted some settings

parent 86e3af59
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python
from tools import file_reader, file_writer, preprocessor
from pathlib import Path
import pandas as pd
import numpy as np
import paths as pt
import yaml
# Substring of DevISOClass that tags a lend as an alarm lend.
_ALARM_ATS = "222718"
_SECONDS_PER_DAY = 24 * 3600


def _merge_loans(df):
    """Collapse repeated lends of the same device class per citizen.

    For every (CitizenId, DevISOClass) pair, the LendDate of the first lend
    (after sorting by CitizenId and LendDate) is combined with the
    BirthYear, Gender and ReturnDate of the last lend of that pair.

    Returns a frame sorted by CitizenId and LendDate with a fresh index.
    """
    key_cols = ['CitizenId', 'DevISOClass']
    df = df.reset_index(drop=True).sort_values(by=['CitizenId', 'LendDate'])
    first_lends = df.drop_duplicates(subset=key_cols, keep='first')
    last_lends = df.drop_duplicates(subset=key_cols, keep='last')
    merged = pd.merge(
        first_lends[key_cols + ['LendDate']],
        last_lends[['CitizenId', 'BirthYear', 'Gender',
                    'DevISOClass', 'ReturnDate']],
        on=key_cols)
    merged = merged[['CitizenId', 'BirthYear', 'Gender',
                     'DevISOClass', 'LendDate', 'ReturnDate']]
    return merged.sort_values(['CitizenId', 'LendDate']).reset_index(drop=True)


def _citizen_features(lends):
    """Aggregate lend rows into one feature row per CitizenId.

    Returns a frame indexed by CitizenId with the columns LoanPeriod (mean
    lend-to-return span in whole days), NumberAts (count of DevISOClass
    entries), Ats (comma-joined DevISOClass values), MaxLendDate and
    MaxReturnDate.
    """
    grouped = lends.groupby('CitizenId')
    # Computed as a standalone Series so nothing is assigned into a slice
    # of the caller's frame (the original triggered SettingWithCopyWarning).
    lend_diff = lends['LendDate'] - lends['ReturnDate']
    loan_period = lend_diff.groupby(lends['CitizenId']).apply(
        lambda d: abs(d.mean().total_seconds()) // _SECONDS_PER_DAY)
    return pd.concat(
        [loan_period.rename('LoanPeriod'),
         grouped['DevISOClass'].apply(','.join).rename('Ats'),
         grouped['DevISOClass'].count().rename('NumberAts'),
         grouped['LendDate'].max().rename('MaxLendDate'),
         grouped['ReturnDate'].max().rename('MaxReturnDate')],
        axis=1, sort=False)


def main():
    """Build the alarm features and labels from the interim lend data.

    Reads 'ats.pkl' from ``pt.INTERIM_DATA_DIR`` and the keys
    ``ats_delimiter`` and ``ats_resolution`` from 'data.yaml' in
    ``pt.CONFIGS_DIR``. Writes 'alarm_labels.npy' (structured array with
    the fields ``Status`` and ``Days_to_alarm``) and 'alarm_features.csv'
    to ``pt.PROCESSED_DATA_DIR``. Takes no arguments and returns nothing.
    """
    raw = file_reader.read_pickle(pt.INTERIM_DATA_DIR, 'ats.pkl')
    with open(Path(pt.CONFIGS_DIR, "data.yaml"), 'r') as stream:
        settings = yaml.safe_load(stream)

    df = raw[['CitizenId', 'BirthYear', 'Gender',
              'LendDate', 'ReturnDate', 'DevISOClass']].copy()
    # Limit the ats class to its first `ats_delimiter` characters.
    df['DevISOClass'] = df['DevISOClass'].str[:settings['ats_delimiter']]
    # Replace missing return dates with the latest observed lend date.
    # Only ReturnDate is filled: a frame-wide fillna would also write the
    # timestamp into missing BirthYear/Gender values.
    df['ReturnDate'] = df['ReturnDate'].fillna(df['LendDate'].max())
    # Return date must be the same as or later than the lend date.
    df = df.loc[df['ReturnDate'] >= df['LendDate']]

    # Merge loans based on ats, lend date and return date.
    df = _merge_loans(df)

    # Days since the citizen's previous lend (0 for the first lend);
    # _merge_loans already returns rows ordered by CitizenId and LendDate.
    df['DeltaLends'] = (df.groupby('CitizenId')['LendDate']
                        .diff().dt.days.fillna(0).astype(int))

    # Tag alarm lends and remember each alarm citizen's DeltaLends value.
    is_alarm_lend = df['DevISOClass'].str.contains(
        _ALARM_ATS, regex=False, na=False)
    alarm_lends = df.loc[is_alarm_lend]
    alarm_delta = dict(zip(alarm_lends['CitizenId'],
                           alarm_lends['DeltaLends']))
    df['GetsAlarm'] = df['CitizenId'].isin(alarm_lends['CitizenId']).astype(int)

    # Keep only lends that precede the citizen's first alarm lend. A citizen
    # whose first lend is the alarm lend therefore has no rows left.
    alarms_so_far = is_alarm_lend.astype(int).groupby(df['CitizenId']).cumsum()
    df = df.loc[alarms_so_far.eq(0)]

    # Make features and merge them with the per-citizen attributes.
    citizens = (df.drop_duplicates(subset=['CitizenId'])
                [['CitizenId', 'BirthYear', 'Gender', 'GetsAlarm']]
                .set_index('CitizenId'))
    df = citizens.join(_citizen_features(df)).reset_index()

    # Sort once by citizen id so X and y stay row-aligned.
    df = df.sort_values(by='CitizenId').reset_index(drop=True)

    # Labels: alarm citizens get Event=True and the DeltaLends value of
    # their alarm lend; the others get Event=False and the span between
    # their latest lend date and latest return date.
    gets_alarm = df['GetsAlarm'].eq(1)
    delta_alarm = df['CitizenId'].map(alarm_delta).fillna(0).astype(int)
    delta_lend_return = (df['MaxReturnDate'] - df['MaxLendDate']).dt.days
    y_df = pd.DataFrame({'Event': gets_alarm,
                         'Days': delta_alarm.where(gets_alarm,
                                                   delta_lend_return)})

    # Features: drop the citizen id and all auxiliary columns.
    x_df = df[['BirthYear', 'Gender', 'LoanPeriod', 'NumberAts', 'Ats']]

    # Prepare data y and x.
    data_y = np.array(list(y_df.itertuples(index=False, name=None)),
                      dtype=[('Status', 'bool'), ('Days_to_alarm', '>i4')])
    data_x = preprocessor.split_cat_columns(
        x_df, col_to_split='Ats', tag='Ats',
        resolution=settings['ats_resolution'])

    file_writer.write_array(data_y, pt.PROCESSED_DATA_DIR, "alarm_labels.npy")
    file_writer.write_csv(data_x, pt.PROCESSED_DATA_DIR, "alarm_features.csv")
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment