MLP_emb_lime.ipynb 9.62 KB
Newer Older
1
2
3
4
{
 "cells": [
  {
   "cell_type": "code",
5
   "execution_count": 1,
6
7
   "metadata": {},
   "outputs": [],
8
   "source": [
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
    "import numpy as np\n",
    "import pandas as pd\n",
    "import paths as pt\n",
    "from tools import file_reader, preprocessor, neural_embedder\n",
    "from utility import metrics\n",
    "from sklearn.metrics import accuracy_score, precision_score\n",
    "from sklearn.metrics import recall_score, roc_auc_score\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.model_selection import train_test_split\n",
    "from pandas.api.types import is_string_dtype, is_numeric_dtype\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "import tensorflow as tf\n",
    "\n",
    "CASE = \"Complete\"\n",
    "FILENAME = \"complete.csv\"\n",
    "\n",
    "class NetworkCategory:\n",
    "    \"\"\"Describes one categorical column: its name, its cardinality and the\n",
    "    width of the entity embedding used to represent it in the network.\"\"\"\n",
    "\n",
    "    def __init__(self, alias: str, unique_values: int):\n",
    "        self.alias = alias\n",
    "        self.unique_values = unique_values\n",
    "        self.embedding_size = self.get_embedding_size(unique_values)\n",
    "\n",
    "    def get_embedding_size(self, unique_values: int) -> int:\n",
    "        \"\"\"Heuristic embedding width: half the cardinality, clamped to [2, 50].\"\"\"\n",
    "        half = int(min(np.ceil(unique_values / 2), 50))\n",
    "        return 2 if half < 2 else half\n",
    "\n",
    "def transpose_to_list(X):\n",
    "    \"\"\"Split a 2-D array into a list of (n, 1) column arrays, one per feature,\n",
    "    matching the multi-input layout of the embedding network.\"\"\"\n",
    "    return [X[..., [col]] for col in range(X.shape[1])]\n",
    "\n",
    "def ginic(actual, pred):\n",
    "    \"\"\"Unnormalized Gini coefficient of predictions `pred` against `actual`.\n",
    "\n",
    "    Sorts `actual` by ascending `pred`, accumulates it, and compares the area\n",
    "    under that curve against the diagonal term (n + 1) / 2.\n",
    "    NOTE(review): assumes numpy arrays and a_c[-1] != 0 (actual not all\n",
    "    zeros) -- confirm with callers.\n",
    "    \"\"\"\n",
    "    n = len(actual)\n",
    "    # Reorder targets by ascending model score\n",
    "    a_s = actual[np.argsort(pred)]\n",
    "    a_c = a_s.cumsum()\n",
    "    giniSum = a_c.sum() / a_c[-1] - (n + 1) / 2.0\n",
    "    return giniSum / n\n",
    " \n",
    "def gini_normalizedc(a, p):\n",
    "    \"\"\"Gini of predictions `p`, normalized by the Gini of a perfect model.\"\"\"\n",
    "    return ginic(a, p) / ginic(a, a)\n",
    "\n",
    "def get_categorial_cols(df, target_name):\n",
    "    \"\"\"Wrap every non-target string-typed column in a NetworkCategory.\"\"\"\n",
    "    return [NetworkCategory(col, df[col].nunique())\n",
    "            for col in df\n",
    "            if col != target_name and is_string_dtype(df[col])]\n",
    "\n",
    "def get_numerical_cols(df, target_name):\n",
    "    \"\"\"Names of every non-target numeric-typed column.\"\"\"\n",
    "    return [col for col in df\n",
    "            if col != target_name and is_numeric_dtype(df[col])]\n",
    "\n",
    "def build_embedding_network(cat_cols, num_cols):    \n",
    "    \"\"\"Build and compile a binary classifier with entity embeddings.\n",
    "\n",
    "    cat_cols: list of NetworkCategory (alias, unique_values, embedding_size).\n",
    "    num_cols: list of numerical column names.\n",
    "    Returns a compiled tf.keras.Model taking one (batch, 1) input per column,\n",
    "    numerical inputs first, then categorical -- the same order that\n",
    "    transpose_to_list produces when numerical columns come first in X.\n",
    "\n",
    "    NOTE(review): Embedding uses input_dim=category.unique_values, so encoded\n",
    "    category codes must lie in [0, unique_values) -- confirm with the label\n",
    "    encoder used upstream.\n",
    "    \"\"\"\n",
    "    # Make numerical layers\n",
    "    numerical_inputs = []\n",
    "    numerical_outputs = []\n",
    "    \n",
    "    for category in num_cols:\n",
    "        input_category = tf.keras.layers.Input(shape=(1,))\n",
    "        output_category = tf.keras.layers.Dense(1, name=category)(input_category)\n",
    "        \n",
    "        numerical_inputs.append(input_category)\n",
    "        numerical_outputs.append(output_category)\n",
    "        \n",
    "    # Make embedding layers\n",
    "    embedding_inputs = []\n",
    "    embedding_outputs = []\n",
    "    \n",
    "    for category in cat_cols:\n",
    "        input_category = tf.keras.layers.Input(shape=(1,))\n",
    "        output_category = tf.keras.layers.Embedding(input_dim=category.unique_values,\n",
    "                                    output_dim=category.embedding_size,\n",
    "                                    name=category.alias)(input_category)\n",
    "        # Flatten the (1, embedding_size) embedding output to (embedding_size,)\n",
    "        output_category = tf.keras.layers.Reshape(target_shape=(category.embedding_size,))(output_category)\n",
    "\n",
    "        embedding_inputs.append(input_category)\n",
    "        embedding_outputs.append(output_category)\n",
    "    \n",
    "    # Concatenate layers\n",
    "    model_inputs = numerical_inputs + embedding_inputs\n",
    "    model_outputs = numerical_outputs + embedding_outputs\n",
    "    \n",
    "    # Make hidden layers\n",
    "    output_model = tf.keras.layers.Concatenate()(model_outputs)\n",
    "    layer_sizes = [80, 20, 10]\n",
    "    dropout_rates = [.35, .15, .15]\n",
    "    for layer_size, dropout_rate in zip(layer_sizes, dropout_rates):\n",
    "        output_model = tf.keras.layers.Dense(layer_size)(output_model)\n",
    "        output_model = tf.keras.layers.Activation(\"relu\")(output_model)\n",
    "        output_model = tf.keras.layers.Dropout(dropout_rate)(output_model)\n",
    "    \n",
    "    # Make final layer\n",
    "    output_model = tf.keras.layers.Dense(1)(output_model)\n",
    "    output_model = tf.keras.layers.Activation('sigmoid')(output_model)\n",
    "\n",
    "    # NB: this local list shadows the imported `metrics` module, but only\n",
    "    # inside this function.\n",
    "    metrics = [\n",
    "      tf.keras.metrics.BinaryAccuracy(name='accuracy'),\n",
    "      tf.keras.metrics.Precision(name='precision'),\n",
    "      tf.keras.metrics.Recall(name='recall'),\n",
    "      tf.keras.metrics.AUC(name='auc'),\n",
    "    ]\n",
    "    model = tf.keras.Model(inputs=model_inputs, outputs=output_model)\n",
    "    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=metrics)\n",
    "    return model"
121
   ]
122
123
124
  },
  {
   "cell_type": "code",
125
   "execution_count": 2,
126
   "metadata": {},
127
128
129
130
   "outputs": [
    {
     "data": {
      "text/plain": [
131
       "<tensorflow.python.keras.callbacks.History at 0x1d90e26a3a0>"
132
133
      ]
     },
134
     "execution_count": 2,
135
     "metadata": {},
136
     "output_type": "execute_result"
137
138
    }
   ],
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
   "source": [
    "# Load the processed dataset, forcing the Ats columns to be read as strings\n",
    "ats_cols = {str(i)+'Ats':str for i in range(1, 10+1)}\n",
    "df = file_reader.read_csv(pt.PROCESSED_DATA_DIR,\n",
    "                          FILENAME,\n",
    "                          converters=ats_cols)\n",
    "\n",
    "emb_cols = df.filter(regex='((\d+)[Ats])\w+', axis=1)\n",
    "n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1\n",
    "\n",
    "# Collect embedded and numerical cols\n",
    "cat_cols = get_categorial_cols(df, CASE)\n",
    "num_cols = get_numerical_cols(df, CASE)\n",
    "\n",
    "# Prepare the data\n",
    "X, y = preprocessor.get_X_y(df, CASE)\n",
    "X, labels = preprocessor.encode_vector_label(X, n_numerical_cols)\n",
    "y = np.array(y)\n",
    "# NOTE(review): consider stratify=y here for a class-balanced split\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=0)\n",
    "\n",
    "# Scale only the numerical block; encoded categorical codes stay untouched\n",
    "scaler = StandardScaler()\n",
    "X_train_sc = scaler.fit_transform(X_train[:,:n_numerical_cols])\n",
    "X_test_sc = scaler.transform(X_test[:,:n_numerical_cols])\n",
    "X_train = np.concatenate([X_train_sc, X_train[:,n_numerical_cols:]], axis=1)\n",
    "X_test = np.concatenate([X_test_sc, X_test[:,n_numerical_cols:]], axis=1)\n",
    "\n",
    "# Network training\n",
    "# BUG FIX: fit on the scaled training split only. The previous code called\n",
    "# model.fit(transpose_to_list(X), y, ...), training on the full *unscaled*\n",
    "# dataset -- leaking the test set into training and bypassing the\n",
    "# StandardScaler computed just above.\n",
    "model = build_embedding_network(cat_cols, num_cols)\n",
    "model.fit(transpose_to_list(X_train), y_train, epochs=10, batch_size=32, verbose=False)"
   ]
168
169
170
  },
  {
   "cell_type": "code",
171
   "execution_count": 3,
172
   "metadata": {},
173
174
175
   "outputs": [
    {
     "name": "stdout",
176
     "output_type": "stream",
177
178
179
180
181
182
183
184
185
186
187
     "text": [
      "[[0 1]\n",
      " [0 1]\n",
      " [0 1]\n",
      " ...\n",
      " [0 1]\n",
      " [0 1]\n",
      " [0 1]]\n"
     ]
    }
   ],
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
   "source": [
    "def prob(data):\n",
    "    \"\"\"Class-probability function for LIME.\n",
    "\n",
    "    Returns an (n, 2) array of [P(class 0), P(class 1)] per row of `data`.\n",
    "\n",
    "    BUG FIX: the previous version thresholded the sigmoid output at 0.5 and\n",
    "    returned hard 0/1 labels. LIME fits its local surrogate on class\n",
    "    probabilities; feeding it constant 0/1 labels made every feature weight\n",
    "    in exp.as_list() come out exactly 0.0. The debug print and the unneeded\n",
    "    `global model` (read-only access) are removed as well.\n",
    "    \"\"\"\n",
    "    y_pred = model.predict(transpose_to_list(data)).reshape(-1, 1)\n",
    "    return np.hstack((1 - y_pred, y_pred))\n",
    "\n",
    "import lime\n",
    "import lime.lime_tabular\n",
    "features = list(df.columns)\n",
    "features.remove('Complete')\n",
    "explainer = lime.lime_tabular.LimeTabularExplainer(X_train, mode='classification',\n",
    "                                                   class_names=['No complete', 'Complete'],\n",
    "                                                   feature_names=features)\n",
    "exp = explainer.explain_instance(X_test[27], prob, num_features=X_train.shape[1])"
   ]
205
206
207
  },
  {
   "cell_type": "code",
208
   "execution_count": 4,
209
   "metadata": {},
210
211
212
213
   "outputs": [
    {
     "data": {
      "text/plain": [
214
       "array([[0.60240066]], dtype=float32)"
215
216
      ]
     },
217
     "execution_count": 4,
218
     "metadata": {},
219
     "output_type": "execute_result"
220
221
    }
   ],
222
223
224
   "source": [
    "# Sanity check: raw sigmoid probability the network assigns to instance 27\n",
    "model.predict(transpose_to_list(np.array([X_test[27],])))"
   ]
225
226
227
  },
  {
   "cell_type": "code",
228
   "execution_count": 5,
229
   "metadata": {},
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('-0.77 < Gender_Male <= 1.30', 0.0),\n",
       " ('Gender_Female <= -1.30', 0.0),\n",
       " ('BirthYear <= -0.67', 0.0),\n",
       " ('-1.00 < Cluster <= 0.01', 0.0),\n",
       " ('LoanPeriod > 0.47', 0.0),\n",
       " ('-0.68 < NumberAts <= -0.28', 0.0),\n",
       " ('1Ats <= 5.00', 0.0),\n",
       " ('2Ats <= 6.00', 0.0),\n",
       " ('3Ats > 31.00', 0.0),\n",
       " ('4Ats > 32.00', 0.0),\n",
       " ('18.00 < 5Ats <= 32.00', 0.0),\n",
       " ('18.00 < 6Ats <= 33.00', 0.0),\n",
       " ('7Ats <= 0.00', 0.0),\n",
       " ('8Ats <= 0.00', 0.0),\n",
       " ('9Ats <= 0.00', 0.0),\n",
       " ('10Ats <= 0.00', 0.0)]"
      ]
     },
252
     "execution_count": 5,
253
     "metadata": {},
254
     "output_type": "execute_result"
255
256
    }
   ],
257
258
259
   "source": [
    "# LIME explanation for instance 27: (binned-feature condition, weight) pairs\n",
    "exp.as_list()"
   ]
260
261
262
  }
 ],
 "metadata": {
263
264
265
266
267
268
269
270
  "interpreter": {
   "hash": "59ff6fbb0321898508cf6243593820bf2585fcfb6693fd00e85ec94ed8847fd0"
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
271
272
273
274
275
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
276
277
278
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
279
   "nbconvert_exporter": "python",
280
281
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
282
283
284
285
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
286
}