Source code for tools.file_reader

"""
file_reader.py
====================================
File reader module to read files.
"""

import csv
import pickle
from io import BytesIO, StringIO
from typing import Any, List, Optional, Union

import joblib
import numpy as np
import pandas as pd

def read_csv(
    infile: StringIO,
    header: Union[int, List[int], str, None] = 'infer',
    sep: str = ',',
    usecols: Optional[List[int]] = None,
    names: Optional[List[str]] = None,
    converters: Optional[dict] = None,
    encoding: Optional[str] = None,
    skiprows=None,
) -> pd.DataFrame:
    """
    This method reads a csv file using Pandas read_csv() method.

    Every argument is forwarded unchanged to ``pd.read_csv``; no additional
    processing is done here.

    :param infile: text input stream
    :param header: file header; ``'infer'`` (the default), a row number,
        a list of row numbers, or ``None`` when the data has no header row
    :param sep: separator identifier
    :param usecols: list of column positions to keep, or ``None`` for all
    :param names: list of column names to use
    :param converters: dict of converters to use
    :param encoding: text encoding passed on to Pandas
    :param skiprows: rows to skip, passed on to Pandas as ``skiprows``
    :return: the csv file as a dataframe
    """
    return pd.read_csv(
        infile,
        header=header,
        sep=sep,
        usecols=usecols,
        names=names,
        converters=converters,
        encoding=encoding,
        skiprows=skiprows,
    )
def read_embedding(infile: StringIO) -> dict:
    """
    This method reads an embedding file.

    The stream is parsed with ``csv.reader``. For every row the first field
    becomes the key and the second field the value; both stay strings and
    any further fields are ignored. A key that appears more than once keeps
    the value of its last row. Blank lines are skipped.

    :param infile: text input stream
    :return: the embedding as a dict
    :raises IndexError: if a non-empty row has fewer than two fields
    """
    # csv.reader yields an empty list for a blank line; indexing it would
    # raise IndexError, so such rows are filtered out.
    return {row[0]: row[1] for row in csv.reader(infile) if row}
def read_array(infile: BytesIO, allow_pickle: bool = False) -> np.ndarray:
    """
    This method reads a NumPy array file using ``np.load``.

    :param infile: binary input stream
    :param allow_pickle: forwarded to ``np.load``; must be ``True`` to load
        arrays that were saved with pickled Python objects. The default of
        ``False`` matches ``np.load``'s own default in current NumPy
        releases, so existing callers are unaffected.
    :return: the NumPy array object
    """
    # SECURITY: only enable allow_pickle for streams from a trusted source;
    # unpickling can execute arbitrary code.
    return np.load(infile, allow_pickle=allow_pickle)
def read_pickle(infile: BytesIO) -> Any:
    """
    This method reads any file stored as a pickle.

    :param infile: binary input stream
    :return: the unpickled object
    """
    # SECURITY: pickle.load can execute arbitrary code while deserialising;
    # only pass streams that come from a trusted source.
    return pickle.load(infile)
def read_joblib(infile: BytesIO) -> Any:
    """
    This method reads a joblib file.

    :param infile: binary input stream
    :return: the object returned by ``joblib.load``
    """
    # NOTE(review): joblib persistence is pickle-based, so this presumably
    # carries the same trust requirements as pickle.load - confirm that
    # callers only pass streams from trusted sources.
    return joblib.load(infile)
def read_excelfile(infile: BytesIO, converters: dict=None) -> pd.DataFrame:
    """
    Load an excel workbook from a binary stream via Pandas read_excel().

    :param infile: binary input stream
    :param converters: dict of converters, handed through to read_excel
    :return: the parsed excel content as a dataframe
    """
    # openpyxl is selected explicitly as the parsing engine.
    return pd.read_excel(infile, converters=converters, engine='openpyxl')
def read_excelfile_sheets(infile: BytesIO, n_sheets: int, converters: Optional[dict] = None) -> pd.DataFrame:
    """
    This method reads the first ``n_sheets`` sheets from an excel file and
    stacks them into one dataframe.

    Sheets are taken in the order of the workbook's ``sheet_names`` and
    concatenated row-wise; the index of each sheet is kept as parsed (it is
    not renumbered).

    :param infile: binary input stream
    :param n_sheets: number of sheets to read; zero or a negative value
        yields an empty dataframe
    :param converters: dict of converters to use, applied to every sheet
    :return: the requested sheets as a single dataframe
    :raises IndexError: if n_sheets exceeds the number of sheets available
    """
    workbook = pd.ExcelFile(infile, engine='openpyxl')
    sheet_names = workbook.sheet_names

    # Fail before parsing anything rather than part-way through the loop.
    if n_sheets > len(sheet_names):
        raise IndexError(
            f"requested {n_sheets} sheets but the workbook only has {len(sheet_names)}"
        )

    frames = [
        workbook.parse(sheet_names[i], converters=converters)
        for i in range(n_sheets)
    ]

    # pd.concat rejects an empty list, so keep the empty-dataframe result
    # for the "no sheets requested" case.
    if not frames:
        return pd.DataFrame()

    # One concat over all sheets avoids recopying the accumulated frame on
    # every iteration and avoids concatenating onto an empty DataFrame.
    return pd.concat(frames)