# Source code for tools.file_reader
"""
file_reader.py
====================================
File reader module to read files.
"""
import csv
import pickle
from io import StringIO, BytesIO
from typing import Any, List

import joblib
import numpy as np
import pandas as pd
def read_csv(infile: StringIO, header: str='infer',
             sep: str=',', usecols: List[int]=None,
             names: List[str]=None, converters: dict=None,
             encoding=None, skiprows=None) -> pd.DataFrame:
    """
    Parse delimited text into a dataframe by delegating to ``pd.read_csv``.

    Every argument is forwarded to pandas unchanged under the keyword
    of the same name.

    :param infile: text input stream
    :param header: header setting forwarded to pandas
    :param sep: separator identifier
    :param usecols: columns to keep, forwarded to pandas
    :param names: list of column names to use
    :param converters: dict of converters to use
    :param encoding: encoding forwarded to pandas
    :param skiprows: rows to skip, forwarded to pandas
    :return: the csv content as a dataframe
    """
    # Gather the pass-through options in one place, then make a single call.
    options = {
        'header': header,
        'sep': sep,
        'usecols': usecols,
        'names': names,
        'converters': converters,
        'encoding': encoding,
        'skiprows': skiprows,
    }
    frame = pd.read_csv(infile, **options)
    return frame
def read_embedding(infile: StringIO) -> dict:
    """
    This method reads an embedding file

    The stream is parsed with ``csv.reader``; the first field of each row
    becomes the key and the second field the value. Both stay strings, and
    when a key occurs more than once the last row wins. Blank lines are
    skipped.

    :param infile: text input stream
    :return: the embedding as a dict
    :raises IndexError: if a non-empty row has fewer than two fields
    """
    reader = csv.reader(infile)
    # csv.reader yields an empty list for a blank line; indexing it would
    # raise IndexError, so such rows are filtered out.
    return {row[0]: row[1] for row in reader if row}
def read_array(infile: BytesIO) -> np.ndarray:
    """
    Load a NumPy array from a binary stream.

    The stream is handed to ``np.load`` with that function's default
    options.

    :param infile: binary input stream
    :return: the object produced by ``np.load``
    """
    loaded = np.load(infile)
    return loaded
def read_pickle(infile: BytesIO) -> Any:
    """
    This method reads any file stored as a pickle

    .. warning::
        Unpickling can execute arbitrary code. Only pass streams that come
        from a trusted source.

    :param infile: binary input stream
    :return: the unpickled object
    """
    # NOTE(security): pickle.load is unsafe on untrusted data; callers are
    # responsible for making sure the stream is trustworthy.
    return pickle.load(infile)
def read_joblib(infile: BytesIO) -> Any:
    """
    This method reads a joblib file

    .. warning::
        joblib persistence is based on pickle, so loading can execute
        arbitrary code. Only pass streams that come from a trusted source.

    :param infile: binary input stream
    :return: the object stored in the joblib file
    """
    # NOTE(security): same trust requirements as pickle.load apply here.
    return joblib.load(infile)
def read_excelfile(infile: BytesIO, converters: dict=None) -> pd.DataFrame:
    """
    Read an excel workbook from a binary stream into a dataframe.

    The work is delegated to ``pd.read_excel`` using the ``openpyxl``
    engine. No sheet is selected explicitly, so pandas' default sheet
    selection applies.

    :param infile: binary input stream
    :param converters: dict of converters to use
    :return: the excel file as a dataframe
    """
    return pd.read_excel(infile, engine='openpyxl', converters=converters)
def read_excelfile_sheets(infile: BytesIO, n_sheets: int,
                          converters: dict=None) -> pd.DataFrame:
    """
    This method reads sheets from an excel file

    The first ``n_sheets`` sheets (in workbook order) are parsed with the
    ``openpyxl`` engine and stacked row-wise into one dataframe. Each
    sheet keeps its own row index; the index is not reset.

    :param infile: binary input stream
    :param n_sheets: number of sheets to read
    :param converters: dict of converters to use
    :return: the requested sheets as one dataframe; an empty dataframe
        when ``n_sheets`` is zero or negative
    :raises IndexError: if ``n_sheets`` exceeds the number of sheets
    """
    workbook = pd.ExcelFile(infile, engine='openpyxl')
    sheet_names = workbook.sheet_names

    # Fail before any parsing work instead of midway through the loop.
    if n_sheets > len(sheet_names):
        raise IndexError(
            f"requested {n_sheets} sheets but the workbook only has "
            f"{len(sheet_names)}"
        )

    # max(..., 0) keeps a negative count meaning "no sheets" rather than
    # letting the slice count from the end of the list.
    frames = [workbook.parse(name, converters=converters)
              for name in sheet_names[:max(n_sheets, 0)]]

    # pd.concat rejects an empty list, so the no-sheet case is handled here.
    if not frames:
        return pd.DataFrame()

    # One concat over all sheets: avoids re-copying the accumulated rows on
    # every iteration and avoids concatenating with an empty seed frame.
    return pd.concat(frames)