# Source code for csverve.core.csverve_input
import os
from typing import List, Dict, Union, Any
import pandas as pd # type: ignore
import yaml
from csverve.errors import CsverveParseError
class CsverveInput(object):
    """
    Gzipped CSV file together with the metadata stored in its YAML sidecar.

    The sidecar is expected at ``<filepath>.yaml``. The keys read from it are
    ``header``, ``sep`` and ``columns`` (a list of entries, each providing a
    ``name`` and a ``dtype``).
    """

    def __init__(self, filepath: str) -> None:
        """
        CSV file and all related metadata.

        @param filepath: Path of CSV.
        @raise CsverveParseError: if the path does not end in '.gz' or the
            YAML metadata file does not exist.
        """
        self.filepath: str = filepath
        # Validate before loading so a missing sidecar is reported as a
        # CsverveParseError instead of a bare error from open().
        self._verify_input()
        self._yamldata: Dict[str, Any] = self._load_yaml()

    @property
    def header(self) -> bool:
        """
        True if file has header.

        @return: header
        """
        return self._yamldata['header']

    @property
    def separator(self) -> str:
        """
        Get the separator used.

        @return: separator
        """
        return self._yamldata['sep']

    @property
    def columns(self) -> List[str]:
        """
        Get the list of column names, in the order listed in the metadata.

        @return: columns
        """
        return [val['name'] for val in self._yamldata['columns']]

    @property
    def dtypes(self) -> Dict[str, str]:
        """
        Get the data types.

        A new dict is built on every access, so callers may modify the result
        without affecting the loaded metadata.

        @return: dtypes, keyed by column name
        """
        return {val['name']: val['dtype'] for val in self._yamldata['columns']}

    @property
    def yaml_file(self) -> str:
        """
        Append '.yaml' to CSV path.

        @return: YAML metadata path.
        """
        return self.filepath + '.yaml'

    def _load_yaml(self) -> Dict[str, Any]:
        """
        Load the yaml data.

        @return: Dict
        """
        with open(self.yaml_file, 'rt') as yamlfile:
            return yaml.safe_load(yamlfile)

    def _verify_input(self) -> None:
        """
        Verify gzip status and check for yaml.

        Only the '.gz' suffix of the path is checked; the file content is not
        inspected.

        @raise CsverveParseError: if the path does not end in '.gz' or the
            YAML metadata file does not exist.
        @return:
        """
        if not self.filepath.endswith('.gz'):
            raise CsverveParseError('input must be gzipped')
        if not os.path.exists(self.yaml_file):
            raise CsverveParseError('yaml file missing')

    def _cast_dataframe(
            self, df: pd.DataFrame, dtypes: Union[Dict[str, Any], None] = None
    ) -> pd.DataFrame:
        """
        Cast dataframe dtypes.

        The DataFrame is modified in place and also returned.

        @param df: Pandas DataFrame.
        @param dtypes: Column name to dtype mapping (optional, defaults to the
            dtypes declared in the metadata).
        @return: Pandas DataFrame.
        """
        # The dtypes property rebuilds its dict on each access; resolve it once
        # instead of once per column.
        if dtypes is None:
            dtypes = self.dtypes
        for column_name in df.columns.values:
            df[column_name] = df[column_name].astype(dtypes[column_name])
        return df

    def _verify_data(self, df: pd.DataFrame, columns) -> None:
        """
        Verify columns of DataFrame match the expected columns.

        The comparison ignores column order.

        @param df: Pandas DataFrame.
        @param columns: Expected column names.
        @raise CsverveParseError: if the two sets of column names differ.
        @return:
        """
        if set(df.columns.values) != set(columns):
            raise CsverveParseError("metadata mismatch in {}".format(self.filepath))

    def read_csv(
            self, chunksize: Union[int, None] = None, usecols=None, dtype=None
    ) -> pd.DataFrame:
        """
        Read CSV.

        @param chunksize: Number of rows to read at a time (optional, applies to large datasets).
        @param usecols: Restrict to specific columns (optional).
        @param dtype: Override the dtypes on specific columns (optional).
        @raise ValueError: if dtype names a column that is not in the metadata.
        @raise CsverveParseError: if the columns read do not match the expected ones
            (raised lazily, per chunk, when chunksize is given).
        @return: pandas DataFrame, or a generator of DataFrames when chunksize is given.
        """

        def return_gen(df_iterator, columns):
            # Verify each chunk as it is consumed.
            for df in df_iterator:
                self._verify_data(df, columns)
                yield df

        # if header exists then use first line (0) as header
        header: Union[int, None] = 0 if self.header else None
        names: Union[None, List[str]] = None if self.header else self.columns
        columns: List[str] = usecols if usecols else self.columns

        # Override dtypes
        final_dtype: Dict[str, Any] = self.dtypes
        if dtype is not None:
            for name, override in dtype.items():
                if name not in final_dtype:
                    raise ValueError(f'dtype column {name} not present')
                final_dtype[name] = override

        try:
            data = pd.read_csv(
                self.filepath,
                compression='gzip',
                chunksize=chunksize,
                sep=self.separator,
                header=header,
                names=names,
                dtype=final_dtype,
                usecols=usecols
            )
        except pd.errors.EmptyDataError:
            # Nothing to parse: fall back to an empty frame with the expected
            # columns, cast with the same (possibly overridden) dtypes.
            empty = self._cast_dataframe(pd.DataFrame(columns=columns), final_dtype)
            # In chunked mode the caller iterates over DataFrames; iterating a
            # DataFrame directly would yield column names instead, so hand
            # back a single empty chunk.
            data = iter([empty]) if chunksize else empty

        if chunksize:
            return return_gen(data, columns)

        self._verify_data(data, columns)
        return data