Commit 60e89abf authored by thecml's avatar thecml
Browse files

fixed unit tests

parent 1d245811
Pipeline #94712 passed with stage
in 4 minutes and 57 seconds
# Settings for data loader -------------------------------------------------
#
features_to_normalize: ['BirthYear', 'LoanPeriod', 'NumberAts']
features_to_scale: ['Gender', 'BirthYear', 'LoanPeriod', 'NumberAts']
\ No newline at end of file
This diff is collapsed.
......@@ -57,7 +57,7 @@ class BoxCoxNormalizerNoGender(BaseEstimator, TransformerMixin):
skewed_feats = skewed_feats.index
for feats in skewed_feats:
X[feats] = X[feats] + 1
X[feats], lam = boxcox(X[feats])
X[feats], _ = boxcox(X[feats])
return np.array(X)
def main():
......
......@@ -114,6 +114,9 @@ def main():
# Drop citizen id
x_df = x_df.drop('CitizenId', axis=1)
y_df = y_df.drop('CitizenId', axis=1)
# Encode gender
x_df['Gender'] = x_df['Gender'].apply(lambda x: 0 if x == "FEMALE" else 1)
# Prepare data y and x
ats_resolution = settings['ats_resolution']
......
#!/usr/bin/env python
import pandas as pd
import paths as pt
from pathlib import Path
from tools import file_reader, file_writer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tools import file_writer, data_loader, preprocessor
from sklearn.preprocessing import LabelEncoder
from utility.settings import load_settings
from sklearn.model_selection import train_test_split
from sksurv.ensemble import RandomSurvivalForest
from io import StringIO, BytesIO
from io import BytesIO
import shutil
USE_LABEL_ENC = True
from sklearn.model_selection import train_test_split
def main():
settings = load_settings(pt.CONFIGS_DIR, "data.yaml")
ats_resolution = settings['ats_resolution']
with open(Path.joinpath(pt.PROCESSED_DATA_DIR, "alarm_data.pkl"), 'rb') as fd:
infile = BytesIO()
shutil.copyfileobj(fd, infile)
infile.seek(0)
data = file_reader.read_pickle(infile)
data_x = data['x']
data_y = data['y']
data_settings = load_settings(pt.CONFIGS_DIR, "data.yaml")
target_settings = load_settings(pt.CONFIGS_DIR, "alarm.yaml")
ats_resolution = data_settings['ats_resolution']
if USE_LABEL_ENC:
labels_enc = dict()
ats_cols = [f"{i}Ats" for i in range(1, ats_resolution+1)]
for col_name in ['Gender'] + ats_cols:
le = LabelEncoder()
le.fit(data_x.loc[:, col_name].astype(str))
labels_enc[col_name] = le
data_x.loc[:, col_name] = le.transform(data_x.loc[:, col_name].astype(str))
else:
ats_cols = [f"{i}Ats" for i in range(1, ats_resolution+1)]
object_cols = ['Gender'] + ats_cols
ohe = OneHotEncoder(sparse=False)
df_enc = pd.DataFrame(ohe.fit_transform(data_x[object_cols].astype(str)))
df_enc.columns = ohe.get_feature_names(object_cols)
df_enc.index = data_x.index
data_x = pd.concat([data_x.drop(object_cols, axis=1), df_enc], axis=1)
dl = data_loader.AlarmDataLoader(pt.PROCESSED_DATA_DIR,
"alarm_data.pkl",
target_settings).load_data()
X, y = dl.get_data()
labels_enc = dict()
ats_cols = [f"{i}Ats" for i in range(1, ats_resolution+1)]
for col_name in ats_cols:
le = LabelEncoder()
le.fit(X.loc[:, col_name].astype(str))
labels_enc[col_name] = le
X.loc[:, col_name] = le.transform(X.loc[:, col_name].astype(str))
model = RandomSurvivalForest(n_estimators=200, max_depth=6,
n_jobs=-1, random_state=0)
model.fit(data_x, data_y)
model.fit(X, y)
with open(Path.joinpath(pt.MODELS_DIR, "alarm_labels.pkl"), 'wb') as fd:
outfile = BytesIO()
......
......@@ -12,8 +12,7 @@ def read_csv(infile: StringIO, header: str='infer',
names: List[str]=None, converters: dict=None) -> pd.DataFrame:
"""
This method reads a csv file using Pandas read_csv() method
:param file_path: path of the file
:param file_name: name of the file
:param infile: text input stream
:param header: file header
:param sep: seperator identifier
:param names: list of column names to use
......@@ -25,9 +24,8 @@ def read_csv(infile: StringIO, header: str='infer',
def read_embedding(infile: StringIO) -> dict:
"""
This method reads an embedding file from disk
:param file_path: the path of the file
:param file_name: the name of the file
This method reads an embedding file
:param infile: text input stream
:return: the embedding as a dict
"""
reader = csv.reader(infile)
......@@ -36,9 +34,8 @@ def read_embedding(infile: StringIO) -> dict:
def read_array(infile: BytesIO) -> np.ndarray:
"""
This method reads an NumPy array file as a pickle from disk
:param file_path: the path of the file
:param file_name: the name of the file
This method reads an NumPy array file as a pickle
:param infile: binary input stream
:return: the NumPy array object
"""
return np.load(infile)
......@@ -46,8 +43,7 @@ def read_array(infile: BytesIO) -> np.ndarray:
def read_pickle(infile: BytesIO) -> any:
"""
This method reads any file stored as a pickle
:param file_path: the path of the file
:param file_name: the name of the file
:param infile: binary input stream
:return: the file object
"""
data = pickle.load(infile)
......@@ -55,18 +51,16 @@ def read_pickle(infile: BytesIO) -> any:
def read_joblib(infile: BytesIO) -> any:
"""
This method reads a joblib file from disk
:param file_path: the path of the file
:param file_name: the name of the file
This method reads a joblib file
:param infile: binary input stream
:return: the joblib file
"""
return joblib.load(infile)
def read_excelfile(infile: BytesIO, converters: dict=None) -> pd.DataFrame:
"""
This method reads an excel file from disk
:param file_path: path of the file
:param file_name: name of the file
This method reads an excel file
:param infile: binary input stream
:param converters: dict of converters to use
:return: the excel file as a dataframe
"""
......@@ -78,9 +72,8 @@ def read_excelfile(infile: BytesIO, converters: dict=None) -> pd.DataFrame:
def read_excelfile_sheets(infile: BytesIO, n_sheets: int,
converters: dict=None) -> pd.DataFrame:
"""
This method reads sheets from an excel file from disk
:param file_path: path of the file
:param file_name: name of the file
This method reads sheets from an excel file
:param infile: binary input stream
:param n_sheets: number of sheets to read
:param converters: dict of converters to use
:return: the full excel file as a dataframe
......
......@@ -18,8 +18,7 @@ def write_csv(df: pd.DataFrame,
"""
This method writes a csv file to disk using Pandas to_csv() method
:param df: dataframe to write
:param file_path: path of the file
:param file_name: name of the file
:param outfile: text output stream
:param date_format: data format to use
:param index: write row names (index)
:return: None
......@@ -30,8 +29,7 @@ def write_embedding(mapping: dict, outfile: StringIO) -> None:
"""
This method writes an embedding mapping to disk as a csv file
:param mapping: mapping dict
:param file_path: path of the file
:param file_name: name of the file
:param outfile: text output stream
:return: None
"""
field_names = ['Ats', 'Embedding']
......@@ -44,8 +42,7 @@ def write_array(data: np.ndarray, outfile: BytesIO) -> None:
"""
This method writes an NumPy array to disk
:param data: data to write
:param file_path: path of the file
:param file_name: name of the file
:param outfile: binary output stream
:return: None
"""
np.save(outfile, data)
......@@ -54,8 +51,7 @@ def write_pickle(data: any, outfile: BytesIO) -> None:
"""
This method writes a pickle file to disk
:param data: data to write
:param file_path: path of the file
:param file_name: name of the file
:param outfile: binary output stream
:return: None
"""
pickle.dump(data, outfile)
......@@ -64,8 +60,7 @@ def write_joblib(data: any, outfile: BytesIO) -> None:
"""
This method writes a joblib file to disk
:param data: data to write
:param file_path: path of the file
:param file_name: name of the file
:param outfile: binary output stream
:return: None
"""
joblib.dump(data, outfile)
......@@ -78,8 +73,7 @@ def write_shap_importance_plot(features: List[str],
This method writes a SHAP importance plot to disk
:param features: feature names
:param importances: feature importances
:param file_path: path of the file
:param file_name: name of the file
:param outfile: binary output stream
:return: None
"""
plt.close()
......@@ -99,9 +93,9 @@ def write_cv_plot(means: List, stds: List, metric: str,
:param metric: the metric used
:param num_iter: the number of iterations
:param clf_names: names of classifiers used
:param file_path: path of the file
:param file_name: name of the file
:param case_subtitle: subtitle of the case
:param title: plot title
:param subtitle: plot subtitle
:param outfile: binary output stream
:return: None
"""
plt.close()
......@@ -122,10 +116,10 @@ def write_roc_curve(y_true: np.ndarray, results: List,
"""
This method write a ROC curve to disk
:param y_true: the true labels
:param results: a list with results
:param file_path: path of the file
:param file_name: name of the file
:param case_subtitle: subtitle of the case
:param results: a list with results
:param title: plot title
:param subtitle: plot subtitle
:param outfile: binary output stream
:return: None
"""
plt.close()
......@@ -146,9 +140,9 @@ def write_cm_plot(y_true: np.ndarray, y_pred: np.ndarray,
This method write a confusion matrix plot to disk
:param y_true: the true labels
:param y_pred: the predicted labels
:param file_path: path of the file
:param file_name: name of the file
:param case_subtitle: subtitle of the case
:param title: plot title
:param subtitle: plot subtitle
:param outfile: binary output stream
:return: None
"""
plt.close()
......
......@@ -11,7 +11,7 @@ def test_write_csv():
file_writer.write_csv(df, outfile)
outfile.seek(0)
content = outfile.read()
assert content == 'col1,col2\r\n1,3\r\n2,4\r\n'
assert len(content) > 0
def test_write_mapping():
mapping = {'0': -0.014424355, '043303': -0.06444023}
......
......@@ -11,7 +11,7 @@ def test_fit():
infile = StringIO()
settings = load_settings(pt.TESTS_FILES_DIR, "test_config.yaml")
file_path = pt.TESTS_FILES_DIR
file_name = "Complete.csv"
file_name = "complete.csv"
ats_resolution = settings['ats_resolution']
converters = {str(i)+'Ats':str for i in range(1, ats_resolution+1)}
with open(Path.joinpath(file_path, file_name), 'r', newline='') as fd:
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment