"""
preprocessor.py
====================================
Preprocessor to prepare data for models.
"""
from typing import List, Tuple
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
import numpy as np
from scipy.stats import skew, boxcox
def series_to_list(series: pd.Series) -> List:
    """
    Convert a pd.Series object into a plain Python list.

    :param series: the Series to be converted
    :return: a list with the elements of the Series, in iteration order
    """
    # list() consumes the Series iterator directly, which is what the
    # previous hand-written append loop did one element at a time.
    return list(series)
def sample(X: np.ndarray, y: np.ndarray, n: int) -> Tuple[np.ndarray,
                                                          np.ndarray]:
    """
    Draw n random rows from X together with the matching entries of y.
    Row indices are drawn with np.random.randint from [0, X.shape[0]),
    so the same row may be selected more than once.

    :param X: the X array to sample from
    :param y: the y array to sample from
    :param n: the number of samples
    :return: the tuple containing the sampled rows of X and entries of y
    """
    picked = np.random.randint(0, X.shape[0], size=n)
    return X[picked, :], y[picked]
def get_X_y(df: pd.DataFrame, name_target: str) -> Tuple[List, List]:
    """
    Split a dataframe row by row into features (X) and targets (y).
    Every row contributes one list of feature values (all columns except
    the target) and one target value cast to int.

    :param df: the dataframe to be used as source
    :param name_target: the name of the target variable
    :return: the list of feature rows and the list of integer targets
    """
    rows = [row for _, row in df.iterrows()]
    X_list = [list(row.drop(name_target)) for row in rows]
    y_list = [int(row[name_target]) for row in rows]
    return X_list, y_list
def transpose_to_list(X: np.ndarray) -> List[np.ndarray]:
    """
    Break the ndarray X (features) up into one array per feature.
    Each element is selected with a one-element index list on the last
    axis, so that axis is kept with length 1.

    :param X: the ndarray to be used as source
    :return: a list with one ndarray per index along X.shape[1]
    """
    return [X[..., [col]] for col in range(X.shape[1])]
def get_ats_list(ats: pd.DataFrame) -> pd.DataFrame:
    """
    Group ats by CitizenId and join the DevISOClass values of each
    group into one comma-separated string, stored in a column named Ats.

    :param ats: the dataframe with associated CitizenId and DevISOClass column
    :return: the dataframe with one row per CitizenId and its joined ats
    """
    joined = ats.groupby(['CitizenId'])['DevISOClass'].apply(",".join)
    grouped = joined.to_frame().reset_index()
    return grouped.rename(columns={'DevISOClass': 'Ats'})
def get_class_weight(neg: int, pos: int) -> dict:
    """
    Compute the class weights of a binary classification problem from
    the number of negative and positive labels. Each class is weighted
    by (1 / count) * total / 2.

    :param neg: number of negative labels
    :param pos: number of positive labels
    :return: the class weight as a dictionary keyed by 0 and 1
    """
    total = neg + pos
    counts = ((0, neg), (1, pos))
    return {label: (1 / count) * total / 2.0 for label, count in counts}
def prepare_data_for_emb(df: pd.DataFrame,
                         target_name: str,
                         train_ratio: float,
                         n_num_cols: int = None) -> Tuple[np.ndarray, np.ndarray,
                                                          np.ndarray, np.ndarray,
                                                          List[LabelEncoder]]:
    """
    Prepare the data in a dataframe for a neural embedder.
    The X and y variables are extracted, the categorial variables
    are label encoded and the result is split into a train and test
    set with a fixed random_state of 0.

    :param df: dataframe containing the X and y values
    :param target_name: name of target label
    :param train_ratio: the split ratio, passed on as train_size
    :param n_num_cols: number of numerical columns in dataframe
    :return: X_train, X_test, y_train, y_test and the label encoders
    """
    features, targets = get_X_y(df, target_name)
    encoded, label_encoders = encode_vector_label(features, n_num_cols)
    split = train_test_split(encoded, np.array(targets),
                             train_size=train_ratio,
                             random_state=0)
    X_train, X_test, y_train, y_test = split
    return X_train, X_test, y_train, y_test, label_encoders
def normalize_data(df: pd.DataFrame, feature_names: List[str]) -> pd.DataFrame:
    """
    Normalize the skewed features of a dataframe.
    Every listed feature whose skewness (computed without missing
    values) is above 0.25 is shifted by 1 and Box-Cox transformed.
    The columns are overwritten on the dataframe that was passed in.

    :param df: dataframe containing the data
    :param feature_names: names of features to normalize
    :return: the dataframe with normalized features
    """
    skewness = df[feature_names].apply(lambda column: skew(column.dropna()))
    for col in skewness[skewness > 0.25].index:
        df[col] = df[col] + 1
        # boxcox returns (transformed values, fitted lambda); keep the values
        df[col] = boxcox(df[col])[0]
    return df
def scale_data(df: pd.DataFrame, feature_names: List[str]) -> pd.DataFrame:
    """
    Scale the listed features of a dataframe with a freshly fitted
    StandardScaler. The columns are overwritten on the dataframe that
    was passed in.

    :param df: dataframe containing the data
    :param feature_names: names of features to scale
    :return: the dataframe with scaled features
    """
    scaled = StandardScaler().fit_transform(df[feature_names])
    df[feature_names] = scaled
    return df
def one_hot_encode(df: pd.DataFrame, feature_names: List[str]) -> pd.DataFrame:
    """
    One-hot-encode the listed features of a dataframe.
    The features are cast to str before encoding. The result contains
    only the encoded columns and shares its index with the input.

    :param df: dataframe containing the data
    :param feature_names: names of features to one-hot-encode
    :return: a dataframe with one-hot-encoded features
    """
    # scikit-learn renamed `sparse` to `sparse_output` and later removed
    # the old keyword; try the new name first and fall back for old releases.
    try:
        ohe = OneHotEncoder(sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(sparse=False)
    df_enc = pd.DataFrame(ohe.fit_transform(df[feature_names].astype(str)))
    # `get_feature_names` was replaced by `get_feature_names_out`;
    # use whichever the installed release provides.
    if hasattr(ohe, "get_feature_names_out"):
        df_enc.columns = ohe.get_feature_names_out(feature_names)
    else:
        df_enc.columns = ohe.get_feature_names(feature_names)
    df_enc.index = df.index
    return df_enc
def split_cat_columns(df: pd.DataFrame, col_to_split: str,
                      tag: str, resolution: int) -> pd.DataFrame:
    """
    Split a comma-separated categorial column into separate columns.
    At most `resolution` parts are kept, missing parts are filled with 0
    and the new columns are named '1<tag>', '2<tag>', ... The original
    column is removed from the result. If no row has `resolution` parts,
    fewer columns are produced.

    :param df: dataframe containing the data
    :param col_to_split: name of column to split
    :param tag: name of tag for new columns
    :param resolution: the maximum number of parts to keep
    :return: a dataframe with splitted columns
    """
    parts = df[col_to_split].str.split(pat=",", expand=True)
    # Keep only the leading `resolution` parts.
    parts = parts.iloc[:, :resolution].fillna(0)
    result = pd.concat([df, parts], axis=1).drop(col_to_split, axis=1)
    # rename() ignores labels that are not present, so one mapping covers
    # the case where fewer than `resolution` parts exist.
    new_names = {i: f'{i+1}{tag}' for i in range(resolution)}
    return result.rename(columns=new_names)
def encode_vector_label(data: List[np.ndarray],
                        n_num_cols: int = None) -> Tuple[np.ndarray,
                                                         List[LabelEncoder]]:
    """
    Label-encode the categorial columns of the data.
    The first `n_num_cols` columns are treated as numerical and only
    cast to float; every following column gets its own LabelEncoder.
    When `n_num_cols` is None all columns are label-encoded.

    :param data: a List of data rows
    :param n_num_cols: number of numerial columns to skip
    :return: a Tuple with the encoded data as a float array and the
        fitted encoders, one per encoded column in column order
    """
    raw = np.array(data)
    first_cat_col = 0 if n_num_cols is None else n_num_cols

    # Write the labels into a separate float array. Writing them back into
    # `raw` is unsafe: when `raw` is a fixed-width string array, NumPy
    # silently truncates longer values (label 10 stored in a '<U1' array
    # becomes '1'), corrupting the encoding.
    encoded = np.empty(raw.shape, dtype=float)
    encoded[:, :first_cat_col] = raw[:, :first_cat_col].astype(float)

    encoders = []
    for col in range(first_cat_col, raw.shape[1]):
        le = LabelEncoder()
        encoded[:, col] = le.fit_transform(raw[:, col])
        encoders.append(le)
    return encoded, encoders
def replace_cat_values(df: pd.DataFrame,
                       mapping: pd.DataFrame) -> pd.DataFrame:
    """
    Replace the categorial values in a dataframe by their real-world
    counterpart. Each row of the mapping is read as a
    (value to replace, replacement) pair.

    :param df: a dataframe with the values to replace
    :param mapping: the mapping to use
    :return: a dataframe where categorial values have been replaced
    """
    lookup = {old: new for old, new in mapping.values}
    return df.replace(to_replace=lookup)