Commit 785379cb authored by thecml's avatar thecml
Browse files

added test for neural emb

parent 950f302d
Pipeline #94199 failed with stage
in 4 minutes and 30 seconds
......@@ -57,9 +57,9 @@ def main(ats_resolution: int = None):
def encode_dataframe(df, target_name, batch_size, train_ratio, epochs,
optimizer, network_layers, verbose, model_path):
X_train, X_val, y_train, y_val, labels = preprocessor.prepare_data_for_embedder(df,
target_name,
train_ratio)
X_train, X_val, y_train, y_val, labels = preprocessor.prepare_data_for_emb(df,
target_name,
train_ratio)
network = neural_embedder.NeuralEmbedder(df=df, target_name=target_name,
epochs=epochs, batch_size=batch_size,
network_layers=network_layers,
......
import numpy as np
import pandas as pd
from tensorflow.python.data.ops.dataset_ops import normalize_to_dense
import paths as pt
from abc import ABC, abstractmethod
from tools import file_reader, preprocessor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from scipy.stats import skew, boxcox
from typing import Tuple, List
from pathlib import Path
from io import BytesIO, StringIO
......
......@@ -97,12 +97,12 @@ def get_class_weight(neg: int, pos: int) -> dict:
class_weight = {0: weight_for_0, 1: weight_for_1}
return class_weight
def prepare_data_for_embedder(df: pd.DataFrame,
target_name: str,
train_ratio: float,
n_num_cols: int = None) -> Tuple[np.ndarray, np.ndarray,
np.ndarray, np.ndarray,
List[LabelEncoder]]:
def prepare_data_for_emb(df: pd.DataFrame,
target_name: str,
train_ratio: float,
n_num_cols: int = None) -> Tuple[np.ndarray, np.ndarray,
np.ndarray, np.ndarray,
List[LabelEncoder]]:
"""
This method prepares data in dataframe for a
neural embedder by extracting the X, y variables,
......
Gender,BirthYear,LoanPeriod,NumberAts,1Ats,2Ats,3Ats,4Ats,5Ats,6Ats,7Ats,8Ats,9Ats,10Ats,Complete
0,59,326,12,181810,120606,123103,093307,093307,120603,093307,091203,091203,120603,0
0,59,364,26,181810,120606,123103,093307,093307,120603,093307,091203,091203,120603,1
0,59,490,42,181810,120606,123103,093307,093307,120603,093307,091203,091203,120603,0
0,36,881,5,120606,120606,222718,091203,093307,0,0,0,0,0,1
0,36,928,5,120606,120606,222718,091203,093307,0,0,0,0,0,1
0,37,579,5,120606,093307,222718,222718,091209,0,0,0,0,0,0
1,40,3863,49,122303,122203,181006,120316,120316,181003,120316,120316,181006,181003,0
0,54,1166,17,120606,120606,120724,122203,043303,120606,180903,043609,242103,093307,0
0,54,1203,29,120606,120606,120724,122203,043303,120606,180903,043609,242103,093307,0
0,29,0,0,0,0,0,0,0,0,0,0,0,0,1
0,29,0,0,0,0,0,0,0,0,0,0,0,0,1
0,29,0,0,0,0,0,0,0,0,0,0,0,0,0
1,29,586,2,093307,120606,0,0,0,0,0,0,0,0,0
0,32,2017,9,120606,120306,120306,120606,242103,181210,043306,091218,093307,0,0
0,37,1647,10,221803,120606,091218,150303,222712,123903,180903,222718,222718,093307,1
0,37,1724,10,221803,120606,091218,150303,222712,123903,180903,222718,222718,093307,1
1,35,0,0,0,0,0,0,0,0,0,0,0,0,0
0,45,353,14,123103,120606,120606,120606,222718,091218,123109,091203,091233,093307,1
0,45,406,14,123103,120606,120606,120606,222718,091218,123109,091203,091233,093307,1
0,45,490,14,123103,120606,120606,120606,222718,091218,123109,091203,091233,093307,1
0,45,789,15,123103,120606,120606,120606,222718,091218,123109,091203,091233,093307,1
0,45,681,20,123103,120606,120606,120606,222718,091218,123109,091203,091233,093307,1
0,45,765,21,123103,120606,120606,120606,222718,091218,123109,091203,091233,093307,1
0,45,877,22,123103,120606,120606,120606,222718,091218,123109,091203,091233,093307,1
0,45,727,37,123103,120606,120606,120606,222718,091218,123109,091203,091233,093307,1
0,45,829,37,123103,120606,120606,120606,222718,091218,123109,091203,091233,093307,1
1,46,261,3,122203,043303,122203,0,0,0,0,0,0,0,1
1,46,305,3,122203,043303,122203,0,0,0,0,0,0,0,1
1,46,361,7,122203,043303,122203,091218,043303,222718,222718,0,0,0,0
0,44,0,0,0,0,0,0,0,0,0,0,0,0,0
0,51,83,10,222718,222718,093307,120606,091218,120724,091203,043303,093307,122203,1
0,51,132,11,222718,222718,093307,120606,091218,120724,091203,043303,093307,122203,1
0,51,282,11,222718,222718,093307,120606,091218,120724,091203,043303,093307,122203,0
0,51,489,11,222718,222718,093307,120606,091218,120724,091203,043303,093307,122203,0
0,51,883,11,222718,222718,093307,120606,091218,120724,091203,043303,093307,122203,1
0,51,932,11,222718,222718,093307,120606,091218,120724,091203,043303,093307,122203,1
1,52,1915,13,222704,222403,120606,120606,120316,093307,091218,120606,181210,043306,1
1,52,1977,13,222704,222403,120606,120606,120316,093307,091218,120606,181210,043306,1
0,36,2897,5,242103,090903,242103,221803,120606,0,0,0,0,0,1
0,36,2981,5,242103,090903,242103,221803,120606,0,0,0,0,0,1
1,32,942,12,221803,221803,123109,043306,181021,123103,181210,181228,222718,123103,0
1,35,70,6,120606,093307,120606,222718,222718,091209,0,0,0,0,0
0,35,926,9,120606,181003,091218,093307,222718,122203,043303,091203,091218,0,0
0,46,1609,1,120606,0,0,0,0,0,0,0,0,0,0
0,21,2412,21,242103,242103,120306,120606,120727,120606,093307,091218,120606,120724,0
0,50,90,21,091203,122203,043303,183015,183015,181210,043306,091203,123603,123621,1
0,50,153,21,091203,122203,043303,183015,183015,181210,043306,091203,123603,123621,1
1,32,105,7,093307,181210,043306,120606,043303,120606,043303,0,0,0,1
1,32,153,8,093307,181210,043306,120606,043303,120606,043303,043303,0,0,1
1,32,146,10,093307,181210,043306,120606,043303,120606,043303,043303,181218,043306,1
1,32,185,10,093307,181210,043306,120606,043303,120606,043303,043303,181218,043306,1
1,32,201,11,093307,181210,043306,120606,043303,120606,043303,043303,181218,043306,1
1,32,333,13,093307,181210,043306,120606,043303,120606,043303,043303,181218,043306,1
1,32,396,13,093307,181210,043306,120606,043303,120606,043303,043303,181218,043306,1
1,32,548,16,093307,181210,043306,120606,043303,120606,043303,043303,181218,043306,0
0,42,0,0,0,0,0,0,0,0,0,0,0,0,1
0,42,0,0,0,0,0,0,0,0,0,0,0,0,1
0,24,2281,4,222704,222403,120606,093307,0,0,0,0,0,0,0
0,32,0,0,0,0,0,0,0,0,0,0,0,0,0
0,28,677,2,242103,093307,0,0,0,0,0,0,0,0,0
0,36,1444,10,120606,120727,120606,120724,222718,091218,091203,181006,222718,180903,1
0,36,1540,10,120606,120727,120606,120724,222718,091218,091203,181006,222718,180903,1
1,33,0,0,0,0,0,0,0,0,0,0,0,0,1
1,33,0,0,0,0,0,0,0,0,0,0,0,0,1
1,33,0,0,0,0,0,0,0,0,0,0,0,0,0
0,33,1104,13,120606,122203,043303,120606,120724,120606,120606,222718,222718,181210,1
0,33,1108,14,120606,122203,043303,120606,120724,120606,120606,222718,222718,181210,1
0,45,830,22,120606,090903,242103,091212,120306,120306,093307,181006,090906,122203,0
0,45,791,39,120606,090903,242103,091212,120306,120306,093307,181006,090906,122203,0
0,37,322,24,120606,120606,093307,222718,043306,181218,122218,043303,123621,123621,1
0,37,348,24,120606,120606,093307,222718,043306,181218,122218,043303,123621,123621,1
0,37,711,30,120606,120606,093307,222718,043306,181218,122218,043303,123621,123621,0
0,37,1086,39,120606,120606,093307,222718,043306,181218,122218,043303,123621,123621,1
0,37,1149,39,120606,120606,093307,222718,043306,181218,122218,043303,123621,123621,1
0,37,1200,39,120606,120606,093307,222718,043306,181218,122218,043303,123621,123621,1
0,37,1239,40,120606,120606,093307,222718,043306,181218,122218,043303,123621,123621,1
0,37,1275,40,120606,120606,093307,222718,043306,181218,122218,043303,123621,123621,0
1,50,22,1,093307,0,0,0,0,0,0,0,0,0,0
0,32,668,4,093307,120606,122203,043303,0,0,0,0,0,0,1
0,32,742,4,093307,120606,122203,043303,0,0,0,0,0,0,1
0,28,0,0,0,0,0,0,0,0,0,0,0,0,0
1,50,2218,17,120606,242103,120606,122203,043303,122303,091218,043303,120306,120306,1
1,50,1779,22,120606,242103,120606,122203,043303,122303,091218,043303,120306,120306,1
0,35,1436,7,091203,091233,120606,093307,091203,093307,091218,0,0,0,1
0,35,1467,7,091203,091233,120606,093307,091203,093307,091218,0,0,0,1
0,46,182,29,093307,222718,222718,120606,043306,043303,123112,123103,123103,123103,0
1,37,0,0,0,0,0,0,0,0,0,0,0,0,1
1,37,0,0,0,0,0,0,0,0,0,0,0,0,1
1,37,0,0,0,0,0,0,0,0,0,0,0,0,1
1,37,0,0,0,0,0,0,0,0,0,0,0,0,1
1,32,0,0,0,0,0,0,0,0,0,0,0,0,1
1,32,0,0,0,0,0,0,0,0,0,0,0,0,1
1,32,0,0,0,0,0,0,0,0,0,0,0,0,1
1,32,0,0,0,0,0,0,0,0,0,0,0,0,1
1,32,0,0,0,0,0,0,0,0,0,0,0,0,0
1,32,0,0,0,0,0,0,0,0,0,0,0,0,0
1,32,869,3,120606,120606,093307,0,0,0,0,0,0,0,0
1,43,1609,3,222712,120606,093307,0,0,0,0,0,0,0,0
0,23,1038,8,242103,242103,093307,120606,093307,093307,091218,242103,0,0,1
0,23,1117,8,242103,242103,093307,120606,093307,093307,091218,242103,0,0,1
\ No newline at end of file
......@@ -2,5 +2,6 @@
# Settings for test -------------------------------------------------
#
ats_resolution: 10
features_to_normalize: ['BirthYear']
features_to_scale: ['BirthYear']
\ No newline at end of file
import pandas as pd
import numpy as np
from tools import file_reader, neural_embedder, preprocessor
from utility.settings import load_settings
from io import StringIO
from pathlib import Path
import paths as pt
import shutil
def test_fit():
    """Smoke-test NeuralEmbedder training on the Complete.csv fixture.

    Loads the fixture CSV (via an in-memory copy), splits off the Ats
    embedding columns, prepares train/validation arrays and checks that
    fitting the embedder produces one accuracy entry per epoch.
    """
    settings = load_settings(pt.TESTS_FILES_DIR, "test_config.yaml")
    ats_resolution = settings['ats_resolution']
    # Read the NAts columns as strings so leading zeros in the codes survive.
    converters = {str(i) + 'Ats': str for i in range(1, ats_resolution + 1)}

    # Copy the fixture into an in-memory buffer before parsing it.
    infile = StringIO()
    file_path = pt.TESTS_FILES_DIR
    file_name = "Complete.csv"
    with open(Path.joinpath(file_path, file_name), 'r', newline='') as fd:
        shutil.copyfileobj(fd, infile)
    infile.seek(0)
    df = file_reader.read_csv(infile, converters=converters)

    # Raw string: '\d' in a plain literal is an invalid escape sequence
    # (SyntaxWarning on recent Python versions).
    emb_cols = df.filter(regex=r'((\d+)[Ats])\w+', axis=1)
    # Everything that is not an Ats column or the target counts as numerical.
    n_numerical_cols = df.shape[1] - emb_cols.shape[1] - 1
    df_to_enc = df.iloc[:, n_numerical_cols:]

    X_train, X_val, y_train, y_val, _ = preprocessor.prepare_data_for_emb(
        df_to_enc, "Complete", 0.7)
    network = neural_embedder.NeuralEmbedder(df=df_to_enc, target_name="Complete")
    history = network.fit(X_train, y_train, X_val, y_val)
    # Assumes the embedder's default epoch count is 10 — confirm against
    # NeuralEmbedder's constructor defaults.
    assert len(history.history['accuracy']) == 10
\ No newline at end of file
......@@ -42,7 +42,7 @@ def test_prepare_data_for_embedder():
df = pd.DataFrame()
df['Age'] = np.random.randint(0, 5, 10)
df['Complete'] = np.random.randint(0, 1, 10)
xt, xv, yt, yv, lbl = preprocessor.prepare_data_for_embedder(df, 'Complete', 0.9)
xt, xv, yt, yv, lbl = preprocessor.prepare_data_for_emb(df, 'Complete', 0.9)
assert len(xt) == 9
assert len(xv) == 1
assert len(yt) == 9
......
import pandas as pd
from utility import dataset
def test_create_union_of_ids():
    """The union of two non-empty CitizenId frames must itself be non-empty."""
    squares = pd.DataFrame([i * i for i in range(100)], columns=['CitizenId'])
    doubles = pd.DataFrame([i * 2 for i in range(100)], columns=['CitizenId'])
    union = dataset.create_union_of_ids(squares, doubles)
    assert len(union) != 0
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment