Commit 4350076b authored by thecml
Browse files

updated scripts and notebooks

parent 94e72200
Pipeline #99819 failed with stage
in 1 minute and 2 seconds
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
{
"cells": [
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"import yaml\n",
"from pathlib import Path\n",
"import pandas as pd\n",
"import numpy as np\n",
"import paths as pt\n",
"from sklearn.model_selection import train_test_split\n",
"import xgboost as xgb\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from utility import metrics\n",
"from sklearn.metrics import confusion_matrix\n",
"from tools import data_loader, file_writer\n",
"from sklearn.metrics import accuracy_score, precision_score\n",
"from sklearn.metrics import recall_score, roc_auc_score\n",
"\n",
"# Load settings\n",
"with open(Path.joinpath(pt.CONFIGS_DIR, \"complete_emb.yaml\"), 'r') as stream:\n",
" settings = yaml.safe_load(stream)\n",
" \n",
"protected_col_name = 'Gender'\n",
"y_col_name=\"Complete\"\n",
"\n",
"# Load the data\n",
"file_name = \"complete_emb.csv\"\n",
"dl = data_loader.CompleteDataLoader(file_name, settings).load_data()\n",
"X, y = dl.get_data()\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,\n",
" stratify=y, random_state=0)\n",
"neg, pos = np.bincount(y)\n",
"scale_pos_weight = neg / pos\n",
"\n",
"params = {\"n_estimators\": 400,\n",
" \"objective\": \"binary:logistic\",\n",
" \"scale_pos_weight\": scale_pos_weight,\n",
" \"use_label_encoder\": False,\n",
" \"learning_rate\": 0.1,\n",
" \"eval_metric\": \"logloss\",\n",
" \"seed\": 0\n",
"}\n",
"\n",
"model = xgb.XGBClassifier(**params)\n",
"skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)\n",
"df_test = pd.DataFrame([],columns=list(X.columns)+[\"Complete\"]+[\"output\"]+[\"output_prob\"])"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"def get_df_w_metrics(df, protected_col_name, y_target_name, y_pred_name):\n",
"    \"\"\"Compute per-group confusion-matrix metrics for each value of the\n",
"    protected attribute and return them as one DataFrame (one row per group).\n",
"    \"\"\"\n",
"    metric_cols = [\"TPR\", \"TNR\", \"FPR\", \"FNR\", \"PPV\", \"NPV\", \"FDR\", \"ACC\",\n",
"                   \"F1\", \"LRplus\", \"LRminus\", \"TN\", \"FP\", \"FN\", \"TP\"]\n",
"    rows = []\n",
"\n",
"    for name in df[protected_col_name].unique():\n",
"        group = df[df[protected_col_name] == name]\n",
"        y_true = group[y_target_name]\n",
"        y_pred = group[y_pred_name]\n",
"\n",
"        TN, FP, FN, TP = confusion_matrix(list(y_true), list(y_pred), labels=[0, 1]).ravel()\n",
"\n",
"        # NOTE: numpy scalar division by zero yields nan/inf with a RuntimeWarning.\n",
"        TPR = TP / (TP + FN)\n",
"        TNR = TN / (TN + FP)\n",
"        PPV = TP / (TP + FP)\n",
"        NPV = TN / (TN + FN)\n",
"        FPR = FP / (FP + TN)\n",
"        FNR = FN / (TP + FN)\n",
"        FDR = FP / (TP + FP)\n",
"        ACC = (TP + TN) / (TP + FP + FN + TN)\n",
"        LRplus = TPR / FPR\n",
"        LRminus = FNR / TNR\n",
"        F1 = 2 * (PPV * TPR) / (PPV + TPR)\n",
"\n",
"        rows.append({protected_col_name: name, \"TPR\": TPR, \"TNR\": TNR, \"FPR\": FPR,\n",
"                     \"FNR\": FNR, \"PPV\": PPV, \"NPV\": NPV, \"FDR\": FDR, \"ACC\": ACC,\n",
"                     \"F1\": F1, \"LRplus\": LRplus, \"LRminus\": LRminus, \"TN\": TN,\n",
"                     \"FP\": FP, \"FN\": FN, \"TP\": TP})\n",
"\n",
"    # DataFrame.append was removed in pandas 2.0; build all rows in one pass instead.\n",
"    return pd.DataFrame(rows, columns=[protected_col_name] + metric_cols)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"i=0\n",
"y_valid_pred = 0*y\n",
"valid_acc, valid_pre, valid_recall, valid_roc_auc = list(), list(), list(), list()\n",
"for train_index, valid_index in skf.split(X_train, y_train):\n",
" \n",
" X_train_split, X_valid_split = X_train.iloc[train_index,:], X_train.iloc[valid_index,:]\n",
" y_train_split, y_valid_split = y_train.iloc[train_index], y_train.iloc[valid_index]\n",
" optimize_rounds = True\n",
" early_stopping_rounds = 50\n",
" \n",
" if optimize_rounds:\n",
" eval_set=[(X_valid_split, y_valid_split)]\n",
" fit_model = model.fit(X_train_split, y_train_split,\n",
" eval_set=eval_set,\n",
" eval_metric=metrics.gini_xgb,\n",
" early_stopping_rounds=early_stopping_rounds,\n",
" verbose=False) \n",
" else:\n",
" fit_model = model.fit(X_train_split, y_train_split)\n",
" \n",
" pred = fit_model.predict_proba(X_valid_split)[:,1]\n",
" y_valid_pred.iloc[valid_index] = pred\n",
" y_valid_scores = (y_valid_pred.iloc[valid_index] > 0.5)\n",
" \n",
" # Save data\n",
" y_true_pd=y_valid_split.to_frame().reset_index(drop=True)\n",
" y_pred_pd=y_valid_scores.apply(lambda x: 1 if x == True else 0).to_frame().reset_index(drop=True).rename(columns={\"Complete\" : \"output\"})\n",
" y_pred_prob_pd = pd.DataFrame(pred, columns = [\"output_prob\"])\n",
" \n",
" df_subset = pd.concat([X_valid_split.reset_index(drop=True), y_true_pd, y_pred_pd, y_pred_prob_pd], axis=1)\n",
" df_test = df_test.append(df_subset, ignore_index=True)\n",
"\n",
" # Save metrics\n",
" df_evaluate_proc = get_df_w_metrics(df_subset, protected_col_name, y_col_name, \"output\")\n",
" file_writer.write_csv(df_evaluate_proc, pt.INTERIM_DATA_DIR, \"model\"+str(i) + \"_\" + protected_col_name + \".csv\")\n",
" \n",
" df_evaluate_together = df_subset.copy()\n",
" df_evaluate_all = get_df_w_metrics(df_evaluate_together, protected_col_name, y_col_name, \"output\")\n",
" file_writer.write_csv(df_evaluate_all, pt.INTERIM_DATA_DIR, \"model\"+str(i) + \"_\" + protected_col_name + \"_all.csv\")\n",
" \n",
" valid_acc.append(accuracy_score(y_valid_split, y_valid_scores))\n",
" valid_pre.append(precision_score(y_valid_split, y_valid_scores))\n",
" valid_recall.append(recall_score(y_valid_split, y_valid_scores))\n",
" valid_roc_auc.append(roc_auc_score(y_valid_split, y_valid_pred.iloc[valid_index]))\n",
" \n",
" i=i+1"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"file_writer.write_csv(df_test, pt.INTERIM_DATA_DIR, \"all_test_data.csv\")"
]
}
],
"metadata": {
"interpreter": {
"hash": "1257d43d6e3967ffdae7723e8889b746915ea50e5b681a3d1d09455fe4a03787"
},
"kernelspec": {
"display_name": "Python 3.8.11 64-bit ('py38-air': conda)",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.11"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
%% Cell type:code id: tags:
```
import yaml
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

import paths as pt
from tools import data_loader, file_writer
from utility import metrics

# Load experiment settings from the YAML config.
with open(Path.joinpath(pt.CONFIGS_DIR, "complete_emb.yaml"), 'r') as stream:
    settings = yaml.safe_load(stream)

protected_col_name = 'Gender'  # protected attribute used for per-group metrics
y_col_name = "Complete"        # binary target column

# Load the dataset with embedded features.
file_name = "complete_emb.csv"
dl = data_loader.CompleteDataLoader(file_name, settings).load_data()
X, y = dl.get_data()

# Hold out 20% as a stratified test split; folds are drawn from the train part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0)

# Re-weight the positive class by the negative/positive ratio.
neg, pos = np.bincount(y)
scale_pos_weight = neg / pos

params = {
    "n_estimators": 400,
    "objective": "binary:logistic",
    "scale_pos_weight": scale_pos_weight,
    "use_label_encoder": False,
    "learning_rate": 0.1,
    "eval_metric": "logloss",
    "seed": 0,
}

model = xgb.XGBClassifier(**params)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Accumulator for out-of-fold rows: features + target + prediction + probability.
df_test = pd.DataFrame(columns=list(X.columns) + ["Complete", "output", "output_prob"])
```
%% Cell type:code id: tags:
```
def get_df_w_metrics(df, protected_col_name, y_target_name, y_pred_name):
    """Compute per-group confusion-matrix metrics for each value of the
    protected attribute and return them as one DataFrame.

    Parameters:
        df: frame holding the protected column, targets and predictions.
        protected_col_name: column whose unique values define the groups.
        y_target_name: ground-truth (0/1) column name.
        y_pred_name: predicted (0/1) column name.

    Returns:
        DataFrame with one row per group and the rates/counts below.
    """
    metric_cols = ["TPR", "TNR", "FPR", "FNR", "PPV", "NPV", "FDR", "ACC",
                   "F1", "LRplus", "LRminus", "TN", "FP", "FN", "TP"]
    rows = []

    for name in df[protected_col_name].unique():
        group = df[df[protected_col_name] == name]
        y_true = group[y_target_name]
        y_pred = group[y_pred_name]

        TN, FP, FN, TP = confusion_matrix(list(y_true), list(y_pred), labels=[0, 1]).ravel()

        # NOTE: numpy scalar division by zero yields nan/inf with a RuntimeWarning
        # (e.g. a group with no positives), matching the previous behavior.
        TPR = TP / (TP + FN)
        TNR = TN / (TN + FP)
        PPV = TP / (TP + FP)
        NPV = TN / (TN + FN)
        FPR = FP / (FP + TN)
        FNR = FN / (TP + FN)
        FDR = FP / (TP + FP)
        ACC = (TP + TN) / (TP + FP + FN + TN)
        LRplus = TPR / FPR
        LRminus = FNR / TNR
        F1 = 2 * (PPV * TPR) / (PPV + TPR)

        rows.append({protected_col_name: name, "TPR": TPR, "TNR": TNR, "FPR": FPR,
                     "FNR": FNR, "PPV": PPV, "NPV": NPV, "FDR": FDR, "ACC": ACC,
                     "F1": F1, "LRplus": LRplus, "LRminus": LRminus, "TN": TN,
                     "FP": FP, "FN": FN, "TP": TP})

    # DataFrame.append was removed in pandas 2.0 and grew the frame
    # quadratically; build all rows in a single pass instead.
    return pd.DataFrame(rows, columns=[protected_col_name] + metric_cols)
```
%% Cell type:code id: tags:
```
# 5-fold stratified CV: fit XGBoost per fold, collect out-of-fold predictions,
# write per-fold fairness metrics, and accumulate validation rows in df_test.
i = 0
# Float copy of y so out-of-fold probabilities are not assigned into an
# integer series (0*y kept the int dtype of y).
y_valid_pred = y.astype(float) * 0.0
valid_acc, valid_pre, valid_recall, valid_roc_auc = list(), list(), list(), list()

# Loop-invariant settings hoisted out of the fold loop.
optimize_rounds = True
early_stopping_rounds = 50

for train_index, valid_index in skf.split(X_train, y_train):
    X_train_split, X_valid_split = X_train.iloc[train_index, :], X_train.iloc[valid_index, :]
    y_train_split, y_valid_split = y_train.iloc[train_index], y_train.iloc[valid_index]

    if optimize_rounds:
        # Early stopping on the validation fold, scored with the custom
        # Gini metric (metrics.gini_xgb — presumably a (name, value) xgboost
        # feval; TODO confirm against utility.metrics).
        eval_set = [(X_valid_split, y_valid_split)]
        fit_model = model.fit(X_train_split, y_train_split,
                              eval_set=eval_set,
                              eval_metric=metrics.gini_xgb,
                              early_stopping_rounds=early_stopping_rounds,
                              verbose=False)
    else:
        fit_model = model.fit(X_train_split, y_train_split)

    # Out-of-fold probability of the positive class; hard labels at 0.5.
    pred = fit_model.predict_proba(X_valid_split)[:, 1]
    y_valid_pred.iloc[valid_index] = pred
    y_valid_scores = (y_valid_pred.iloc[valid_index] > 0.5)

    # Assemble this fold's rows: features, ground truth, prediction, probability.
    y_true_pd = y_valid_split.to_frame().reset_index(drop=True)
    y_pred_pd = (y_valid_scores.astype(int).to_frame().reset_index(drop=True)
                 .rename(columns={"Complete": "output"}))
    y_pred_prob_pd = pd.DataFrame(pred, columns=["output_prob"])

    df_subset = pd.concat([X_valid_split.reset_index(drop=True), y_true_pd,
                           y_pred_pd, y_pred_prob_pd], axis=1)
    # DataFrame.append was removed in pandas 2.0; concat is the supported API.
    df_test = pd.concat([df_test, df_subset], ignore_index=True)

    # Per-group fairness metrics for this fold.
    df_evaluate_proc = get_df_w_metrics(df_subset, protected_col_name, y_col_name, "output")
    file_writer.write_csv(df_evaluate_proc, pt.INTERIM_DATA_DIR,
                          "model" + str(i) + "_" + protected_col_name + ".csv")

    df_evaluate_together = df_subset.copy()
    df_evaluate_all = get_df_w_metrics(df_evaluate_together, protected_col_name, y_col_name, "output")
    file_writer.write_csv(df_evaluate_all, pt.INTERIM_DATA_DIR,
                          "model" + str(i) + "_" + protected_col_name + "_all.csv")

    # Fold-level validation scores.
    valid_acc.append(accuracy_score(y_valid_split, y_valid_scores))
    valid_pre.append(precision_score(y_valid_split, y_valid_scores))
    valid_recall.append(recall_score(y_valid_split, y_valid_scores))
    valid_roc_auc.append(roc_auc_score(y_valid_split, y_valid_pred.iloc[valid_index]))

    i = i + 1
```
%% Cell type:code id: tags:
```
# Persist the aggregated out-of-fold validation rows for downstream analysis.
file_writer.write_csv(df_test, pt.INTERIM_DATA_DIR, "all_test_data.csv")
```
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"\n",
"file = 'C:\\\\Users\\\\cml\\\\Downloads\\\\AIR export\\\\Aalborg\\\\Hjælpemidler minus 50_Rasmus_Details-CPR.xlsx'\n",
"df = pd.read_excel(file, engine='openpyxl', converters={'ID': str, 'Kategori ISO nummer': str})\n",
"\n",
"df['Seq'] = df.groupby(['ID', 'Kategori ISO nummer']).cumcount()\n",
"df = df[['ID', 'Birth Year', 'Gender', 'Kategori ISO nummer', 'Kørselsdato', 'Seq']]\n",
"\n",
"df['LendDate'] = df.apply(lambda x: x['Kørselsdato'] if x['Seq'] % 2 == 0 else pd.NaT, axis=1)\n",
"df['ReturnDate'] = df.apply(lambda x: x['Kørselsdato'] if x['Seq'] % 2 == 1 else pd.NaT, axis=1)\n",
"\n",
"df['ReturnDate'] = df.groupby(['ID', 'Kategori ISO nummer'])['ReturnDate'].shift(-1)\n",
"df = df.dropna(subset=['LendDate', 'ReturnDate'], thresh=1)\n",
"\n",
"#df['ReturnDate'] = df['ReturnDate'].shift(-1)\n",
"#df = df.dropna(subset=['LendDate', 'ReturnDate'], thresh=1)\n",
"#df = df.drop(['Kørselsdato', 'Seq'], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>Birth Year</th>\n",
" <th>Gender</th>\n",
" <th>Kategori ISO nummer</th>\n",
" <th>Kørselsdato</th>\n",
" <th>Seq</th>\n",
" <th>LendDate</th>\n",
" <th>ReturnDate</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2429541786</td>\n",
" <td>23</td>\n",
" <td>FEMALE</td>\n",
" <td>22271812</td>\n",
" <td>06/08/19</td>\n",
" <td>0</td>\n",
" <td>06/08/19</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2429541786</td>\n",
" <td>23</td>\n",
" <td>FEMALE</td>\n",
" <td>12060611</td>\n",
" <td>19/02/18</td>\n",
" <td>0</td>\n",
" <td>19/02/18</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2429541786</td>\n",
" <td>23</td>\n",
" <td>FEMALE</td>\n",
" <td>12072401</td>\n",
" <td>19/02/18</td>\n",
" <td>0</td>\n",
" <td>19/02/18</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2430269034</td>\n",
" <td>26</td>\n",
" <td>FEMALE</td>\n",
" <td>22271812</td>\n",
" <td>09/03/20</td>\n",
" <td>0</td>\n",
" <td>09/03/20</td>\n",
" <td>09/11/20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>2430269034</td>\n",
" <td>26</td>\n",
" <td>FEMALE</td>\n",
" <td>12362124</td>\n",
" <td>14/10/19</td>\n",
" <td>0</td>\n",
" <td>14/10/19</td>\n",
" <td>29/07/21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>72044</th>\n",
" <td>74711770410</td>\n",
" <td>99</td>\n",
" <td>FEMALE</td>\n",
" <td>18301509</td>\n",
" <td>13/06/19</td>\n",
" <td>0</td>\n",
" <td>13/06/19</td>\n",
" <td>07/09/21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>72046</th>\n",
" <td>74711770410</td>\n",
" <td>99</td>\n",
" <td>FEMALE</td>\n",
" <td>12220308</td>\n",
" <td>28/09/21</td>\n",
" <td>0</td>\n",
" <td>28/09/21</td>\n",
" <td>07/09/21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>72048</th>\n",
" <td>74711770410</td>\n",
" <td>99</td>\n",
" <td>FEMALE</td>\n",
" <td>12220308</td>\n",
" <td>25/08/16</td>\n",
" <td>2</td>\n",
" <td>25/08/16</td>\n",
" <td>29/09/21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>72050</th>\n",
" <td>74711770410</td>\n",
" <td>99</td>\n",
" <td>FEMALE</td>\n",
" <td>04330301</td>\n",
" <td>07/09/21</td>\n",
" <td>0</td>\n",
" <td>07/09/21</td>\n",
" <td>28/09/21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>72051</th>\n",
" <td>74711770410</td>\n",
" <td>99</td>\n",
" <td>FEMALE</td>\n",
" <td>99999999</td>\n",
" <td>28/09/21</td>\n",
" <td>0</td>\n",
" <td>28/09/21</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>48292 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" ID Birth Year Gender Kategori ISO nummer Kørselsdato Seq \\\n",
"0 2429541786 23 FEMALE 22271812 06/08/19 0 \n",
"1 2429541786 23 FEMALE 12060611 19/02/18 0 \n",
"2 2429541786 23 FEMALE 12072401 19/02/18 0 \n",
"3 2430269034 26 FEMALE 22271812 09/03/20 0 \n",
"5 2430269034 26 FEMALE 12362124 14/10/19 0 \n",
"... ... ... ... ... ... ... \n",
"72044 74711770410 99 FEMALE 18301509 13/06/19 0 \n",
"72046 74711770410 99 FEMALE 12220308 28/09/21 0 \n",
"72048 74711770410 99 FEMALE 12220308 25/08/16 2 \n",
"72050 74711770410 99 FEMALE 04330301 07/09/21 0 \n",
"72051 74711770410 99 FEMALE 99999999 28/09/21 0 \n",
"\n",
" LendDate ReturnDate \n",
"0 06/08/19 NaN \n",
"1 19/02/18 NaN \n",
"2 19/02/18 NaN \n",
"3 09/03/20 09/11/20 \n",
"5 14/10/19 29/07/21 \n",
"... ... ... \n",
"72044 13/06/19 07/09/21 \n",
"72046 28/09/21 07/09/21 \n",
"72048 25/08/16 29/09/21 \n",
"72050 07/09/21 28/09/21 \n",
"72051 28/09/21 NaN \n",
"\n",
"[48292 rows x 8 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
}
],
"metadata": {
"interpreter": {
"hash": "59ff6fbb0321898508cf6243593820bf2585fcfb6693fd00e85ec94ed8847fd0"
},
"kernelspec": {
"display_name": "Python 3.8.8 64-bit ('py38-air': conda)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np

# NOTE(review): hardcoded absolute local path to what looks like PII-bearing
# data — consider a configurable DATA_DIR and access controls.
file = 'C:\\Users\\cml\\Downloads\\AIR export\\Aalborg\\Hjælpemidler minus 50_Rasmus_Details-CPR.xlsx'
df = pd.read_excel(file, engine='openpyxl', converters={'ID': str, 'Kategori ISO nummer': str})

# Number each event within an (ID, ISO category) pair:
# even Seq = lend event, odd Seq = return event.
df['Seq'] = df.groupby(['ID', 'Kategori ISO nummer']).cumcount()
df = df[['ID', 'Birth Year', 'Gender', 'Kategori ISO nummer', 'Kørselsdato', 'Seq']]

# Split the event date into lend/return columns. Vectorized Series.where
# replaces two row-wise df.apply lambdas: identical output, O(n).
is_lend = df['Seq'] % 2 == 0
df['LendDate'] = df['Kørselsdato'].where(is_lend, pd.NaT)
df['ReturnDate'] = df['Kørselsdato'].where(~is_lend, pd.NaT)

# Pull each return date up next to the preceding lend within the same
# (ID, category) group, then keep rows with at least one of the two dates.
df['ReturnDate'] = df.groupby(['ID', 'Kategori ISO nummer'])['ReturnDate'].shift(-1)
df = df.dropna(subset=['LendDate', 'ReturnDate'], thresh=1)
```
%% Cell type:code id: tags:
``` python
# Show the reshaped lend/return frame (rich notebook repr) as a sanity check.
df
```
%%%% Output: execute_result
ID Birth Year Gender Kategori ISO nummer Kørselsdato Seq \
0 2429541786 23 FEMALE 22271812 06/08/19 0
1 2429541786 23 FEMALE 12060611 19/02/18 0
2 2429541786 23 FEMALE 12072401 19/02/18 0
3 2430269034 26 FEMALE 22271812 09/03/20 0
5 2430269034 26 FEMALE 12362124 14/10/19 0
... ... ... ... ... ... ...
72044 74711770410 99 FEMALE 18301509 13/06/19 0
72046 74711770410 99 FEMALE 12220308 28/09/21 0
72048 74711770410 99 FEMALE 12220308 25/08/16 2
72050 74711770410 99 FEMALE 04330301 07/09/21 0
72051 74711770410 99 FEMALE 99999999 28/09/21 0
LendDate ReturnDate
0 06/08/19 NaN
1 19/02/18 NaN
2 19/02/18 NaN
3 09/03/20 09/11/20
5 14/10/19 29/07/21
... ... ...
72044 13/06/19 07/09/21
72046 28/09/21 07/09/21
72048 25/08/16 29/09/21
72050 07/09/21 28/09/21
72051 28/09/21 NaN