Source code for tools.preprocessor

"""
preprocessor.py
====================================
Preprocessor to prepare data for models.
"""

from typing import List, Tuple

import numpy as np
import pandas as pd
from scipy.stats import skew, boxcox
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

def series_to_list(series: pd.Series) -> List:
    """
    This method is used to convert a given pd.Series object into a list

    :param series: the series to be converted
    :return: the list containing all the elements from the Series object
    """
    list_cols = []
    for item in series:
        list_cols.append(item)
    return list_cols

def sample(X: np.ndarray, y: np.ndarray, n: int) -> Tuple[np.ndarray, np.ndarray]:
    """
    This method is used to sample n rows at random indices between [0, X.shape[0])

    :param X: the X array to sample from
    :param y: the y array to sample from
    :param n: the number of samples
    :return: the tuple containing a subset of samples in X and y
    """
    num_row = X.shape[0]
    indices = np.random.randint(num_row, size=n)
    return X[indices, :], y[indices]

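Because sample draws row indices with np.random.randint, the same row can be picked more than once (sampling with replacement). A minimal usage sketch, with hypothetical toy arrays that are not part of the original module:

    X_demo = np.arange(10).reshape(5, 2)   # 5 rows, 2 features
    y_demo = np.array([0, 1, 0, 1, 1])
    X_sub, y_sub = sample(X_demo, y_demo, n=3)
    print(X_sub.shape, y_sub.shape)        # (3, 2) (3,)
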
def get_X_y(df: pd.DataFrame, name_target: str) -> Tuple[List, List]:
    """
    This method is used to gather the X (features) and y (targets) from a
    given dataframe based on a given target name

    :param df: the dataframe to be used as source
    :param name_target: the name of the target variable
    :return: the list of features and targets
    """
    X_list = []
    y_list = []
    for _, record in df.iterrows():
        fl = series_to_list(record.drop(name_target))
        X_list.append(fl)
        y_list.append(int(record[name_target]))
    return X_list, y_list

def transpose_to_list(X: np.ndarray) -> List[np.ndarray]:
    """
    This method is used to convert the ndarray X (features) to a list of features

    :param X: the ndarray to be used as source
    :return: a list of ndarrays containing the elements from the numpy array
    """
    features_list = []
    for index in range(X.shape[1]):
        features_list.append(X[..., [index]])
    return features_list

def get_ats_list(ats: pd.DataFrame) -> pd.DataFrame:
    """
    This method groups ats by CitizenId and returns the result as a single
    column dataframe

    :param ats: the dataframe with associated CitizenId and DevISOClass column
    :return: the dataframe with the grouped ats
    """
    df = pd.DataFrame(ats.groupby(['CitizenId'])['DevISOClass'].apply(",".join)).reset_index()
    df = df.rename(columns={'DevISOClass': 'Ats'})
    return df

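A minimal sketch of the grouping with hypothetical CitizenId and DevISOClass values (not taken from any real data):

    ats_demo = pd.DataFrame({'CitizenId': [1, 1, 2],
                             'DevISOClass': ['12.22.18', '09.33.03', '12.22.18']})
    print(get_ats_list(ats_demo))
    # CitizenId 1 -> Ats '12.22.18,09.33.03', CitizenId 2 -> Ats '12.22.18'
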
def get_class_weight(neg: int, pos: int) -> dict:
    """
    This method computes the class weight for a classification problem
    given the number of negative and positive labels

    :param neg: number of negative labels
    :param pos: number of positive labels
    :return: the class weight as a dictionary
    """
    total = neg + pos
    weight_for_0 = (1 / neg) * total / 2.0
    weight_for_1 = (1 / pos) * total / 2.0
    class_weight = {0: weight_for_0, 1: weight_for_1}
    return class_weight

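The weights are inversely proportional to class frequency and scaled so a perfectly balanced dataset would give both classes a weight of 1. A small worked example with assumed counts of 900 negatives and 100 positives:

    weights = get_class_weight(neg=900, pos=100)
    # weight_for_0 = (1/900) * 1000 / 2 ≈ 0.56, weight_for_1 = (1/100) * 1000 / 2 = 5.0
    print(weights)  # {0: 0.5555555555555556, 1: 5.0}
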
def prepare_data_for_emb(df: pd.DataFrame, target_name: str, train_ratio: float,
                         n_num_cols: int = None) -> Tuple[np.ndarray, np.ndarray,
                                                          np.ndarray, np.ndarray,
                                                          List[LabelEncoder]]:
    """
    This method prepares data in a dataframe for a neural embedder by extracting
    the X (features) and y (targets), label encoding the categorical variables
    and splitting the data into a train and test set given a split ratio.
    Finally it returns the split and the encoded labels

    :param df: dataframe containing the X and y values
    :param target_name: name of target label
    :param train_ratio: the split ratio
    :param n_num_cols: number of numerical columns in dataframe
    :return: the split and the encoded labels
    """
    X, y = get_X_y(df, target_name)
    X, labels = encode_vector_label(X, n_num_cols)
    y = np.array(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_ratio,
                                                        random_state=0)
    return X_train, X_test, y_train, y_test, labels

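A minimal end-to-end sketch with a hypothetical dataframe (the column names Age, Gender and Fall are illustrative only); with n_num_cols=1 the single numerical column is left untouched during label encoding:

    df_demo = pd.DataFrame({'Age': [70, 80, 75, 90],
                            'Gender': ['F', 'M', 'F', 'M'],
                            'Fall': [0, 1, 0, 1]})
    X_train, X_test, y_train, y_test, labels = prepare_data_for_emb(
        df_demo, target_name='Fall', train_ratio=0.75, n_num_cols=1)
    print(X_train.shape, X_test.shape)  # (3, 2) (1, 2)
    print(labels[0].classes_)           # ['F' 'M']
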
def normalize_data(df: pd.DataFrame, feature_names: List[str]) -> pd.DataFrame:
    """
    This method normalizes features in a dataframe by applying a Box-Cox
    transform to those with a skewness above 0.25

    :param df: dataframe containing the data
    :param feature_names: names of features to normalize
    :return: a dataframe with normalized features
    """
    skewed_feats = df[feature_names].apply(lambda x: skew(x.dropna()))
    skewed_feats = skewed_feats[skewed_feats > 0.25]
    skewed_feats = skewed_feats.index
    for feats in skewed_feats:
        df[feats] = df[feats] + 1  # shift by one so all values are strictly positive for boxcox
        df[feats], _ = boxcox(df[feats])
    return df

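A minimal sketch with a hypothetical, heavily right-skewed column (the column name is illustrative):

    rng = np.random.default_rng(0)
    df_demo = pd.DataFrame({'LengthOfStay': rng.exponential(scale=10, size=100)})
    print(round(skew(df_demo['LengthOfStay']), 2))   # clearly above the 0.25 threshold
    df_norm = normalize_data(df_demo, ['LengthOfStay'])
    print(round(skew(df_norm['LengthOfStay']), 2))   # much closer to 0 after Box-Cox
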
def scale_data(df: pd.DataFrame, feature_names: List[str]) -> pd.DataFrame:
    """
    This method scales data in a dataframe to zero mean and unit variance

    :param df: dataframe containing the data
    :param feature_names: names of features to scale
    :return: a dataframe with scaled features
    """
    scaler = StandardScaler()
    df[feature_names] = scaler.fit_transform(df[feature_names])
    return df

def one_hot_encode(df: pd.DataFrame, feature_names: List[str]) -> pd.DataFrame:
    """
    This method one-hot-encodes data in a dataframe

    :param df: dataframe containing the data
    :param feature_names: names of features to one-hot-encode
    :return: a dataframe with one-hot-encoded features
    """
    ohe = OneHotEncoder(sparse=False)
    df_enc = pd.DataFrame(ohe.fit_transform(df[feature_names].astype(str)))
    df_enc.columns = ohe.get_feature_names(feature_names)
    df_enc.index = df.index
    return df_enc

def split_cat_columns(df: pd.DataFrame, col_to_split: str, tag: str,
                      resolution: int) -> pd.DataFrame:
    """
    This method splits a categorical column by a resolution

    :param df: dataframe containing the data
    :param col_to_split: name of column to split
    :param tag: name of tag for new columns
    :param resolution: the resolution to use
    :return: a dataframe with split columns
    """
    split = pd.DataFrame(df[col_to_split].str.split(pat=",", expand=True))
    split = split.drop(split.iloc[:, resolution:], axis=1)
    split = split.fillna(0)
    df = pd.concat([df, split], axis=1)
    df = df.drop(col_to_split, axis=1)
    for i in range(0, resolution):
        try:
            df = df.rename(columns={i: f'{i+1}{tag}'})
        except KeyError:
            pass
    return df

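A minimal sketch: the comma-separated values in a hypothetical 'Ats' column are split into at most resolution columns, missing entries are padded with 0, and the new columns are renamed using the tag:

    df_demo = pd.DataFrame({'Ats': ['a,b,c', 'a', 'b,c']})
    print(split_cat_columns(df_demo, col_to_split='Ats', tag='Ats', resolution=2))
    #   1Ats 2Ats
    # 0    a    b
    # 1    a    0
    # 2    b    c
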
def extract_cat_count(df: pd.DataFrame, cat_values: List[str], cat_feature_names: List[str],
                      prefix: str) -> pd.DataFrame:
    """
    This method extracts the number of times a categorical value appears in a column

    :param df: dataframe containing the data
    :param cat_values: a List of unique categorical values
    :param cat_feature_names: a List of names of categorical features
    :param prefix: the prefix to prepend to the count column names
    :return: a dataframe with categorical data represented as a count
    """
    all_cat = df[cat_feature_names].agg(','.join, axis=1)
    total_arr = np.zeros((len(all_cat), len(cat_values)))
    for i, cat_list in enumerate(all_cat):
        for j, col in enumerate(cat_values):
            cat_count = cat_list.count(col)
            total_arr[i, j] = cat_count
    df = pd.DataFrame(total_arr, columns=cat_values, dtype=int)
    df = df.add_prefix(prefix)
    return df

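A minimal sketch with hypothetical categorical columns; note that str.count performs substring counting, so the categorical values should not be substrings of one another:

    df_demo = pd.DataFrame({'1Ats': ['a', 'b'], '2Ats': ['b', 'b']})
    counts = extract_cat_count(df_demo, cat_values=['a', 'b'],
                               cat_feature_names=['1Ats', '2Ats'], prefix='Cnt')
    print(counts)
    #    Cnta  Cntb
    # 0     1     1
    # 1     0     2
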
def encode_vector_label(data: List[np.ndarray],
                        n_num_cols: int = None) -> Tuple[List[np.ndarray], List[LabelEncoder]]:
    """
    This method label-encodes categorical data

    :param data: a List of data NumPy arrays
    :param n_num_cols: number of numerical columns to skip
    :return: a Tuple of Lists with encoded data and encoders
    """
    encoders = []
    data_encoded = np.array(data)
    if n_num_cols is None:
        for i in range(data_encoded.shape[1]):
            le = LabelEncoder()
            le.fit(data_encoded[:, i])
            encoders.append(le)
            data_encoded[:, i] = le.transform(data_encoded[:, i])
    else:
        for i in range(n_num_cols, data_encoded.shape[1]):
            le = LabelEncoder()
            le.fit(data_encoded[:, i])
            encoders.append(le)
            data_encoded[:, i] = le.transform(data_encoded[:, i])
    data_encoded = data_encoded.astype(float)
    return data_encoded, encoders

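A minimal sketch with two hypothetical numerical columns followed by one categorical column; n_num_cols=2 leaves the first two columns untouched:

    data_demo = [[1.0, 2.0, 'low'], [3.0, 4.0, 'high'], [5.0, 6.0, 'low']]
    encoded, encoders = encode_vector_label(data_demo, n_num_cols=2)
    print(encoded)
    # [[1. 2. 1.]
    #  [3. 4. 0.]
    #  [5. 6. 1.]]
    print(encoders[0].classes_)  # ['high' 'low']
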
def replace_cat_values(df: pd.DataFrame, mapping: pd.DataFrame) -> pd.DataFrame:
    """
    This method replaces categorical values in a dataframe by their real-world
    counterpart given a mapping

    :param df: a dataframe with the values to replace
    :param mapping: the mapping to use
    :return: a dataframe where categorical values have been replaced
    """
    cat_dict = dict(mapping.values)
    df = df.replace(to_replace=cat_dict)
    return df