Source code for csverve.core.irregular_csv_input

import gzip
from typing import Any, Dict, Iterator, List, Optional, Union

import pandas as pd  # type: ignore
from csverve.errors import CsverveInputError


class IrregularCsverveInput(object):
    """
    A delimited text file with a header line, plus caller-supplied dtypes.

    The column names are taken from the first line of the file when the
    object is constructed; the data is loaded on demand with `read_csv`.
    """

    def __init__(self, filepath: str, dtypes: Dict[str, str], sep: str = ',') -> None:
        """
        CSV file and all related metadata.

        @param filepath: Path of CSV; must end in 'csv' or 'gz'.
        @param dtypes: dictionary of pandas dtypes by column, keys = column name, value = dtype.
        @param sep: Field separator used to split the header line and to parse the file.
        """
        self.filepath: str = filepath

        self.sep: str = sep
        # Reads the first line of the file, so the file has to exist already.
        self.columns: List[str] = self.get_columns()

        self.dtypes: Dict[str, str] = dtypes

    @property
    def __file_type(self) -> str:
        """
        Classify the file by its path suffix.

        @return: 'gzip' for paths ending in 'gz', 'plain-text' for paths ending in 'csv'.
        @raise CsverveInputError: for any other suffix.
        """
        if self.filepath.endswith('gz'):
            return 'gzip'
        elif self.filepath.endswith('csv'):
            return 'plain-text'
        else:
            raise CsverveInputError('Unsupported file type: {}'.format(self.filepath))

    @property
    def yaml_file(self) -> str:
        """
        Append '.yaml' to CSV path.

        @return: YAML metadata path.
        """
        return self.filepath + '.yaml'

    def get_columns(self) -> List[str]:
        """
        Read the header (first line) of the file and split it on `self.sep`.

        @return: List of column names, in file order.
        """
        opener: Any = gzip.open if self.__file_type == 'gzip' else open
        with opener(self.filepath, 'rt') as inputfile:
            header: str = inputfile.readline().strip()

        columns: List[str] = header.split(self.sep)

        return columns

    def _check_dtypes(self, df: pd.DataFrame) -> None:
        """
        Verify that every column of `df` has an entry in `self.dtypes`.

        @param df: DataFrame (or chunk) to validate.
        @raise AssertionError: carrying the name of the first column without a dtype.
        """
        for col in df.columns.values:
            if col not in self.dtypes:
                # Raised explicitly (rather than via `assert`) so the check
                # is still performed when Python runs with -O.
                raise AssertionError(col)

    def _iter_chunks(self, chunksize: int) -> Iterator[pd.DataFrame]:
        """
        Lazily yield the file as validated DataFrames of at most `chunksize` rows.

        @param chunksize: Number of rows per chunk.
        @return: Generator of pandas DataFrames.
        """
        try:
            # header=0: the first line holds the column names (see get_columns),
            # so it must be consumed as a header, not parsed as a data row.
            reader = pd.read_csv(
                self.filepath, chunksize=chunksize, header=0,
                sep=self.sep, names=self.columns, dtype=self.dtypes
            )
        except pd.errors.EmptyDataError:
            empty: pd.DataFrame = pd.DataFrame(columns=self.columns)
            self._check_dtypes(empty)
            yield empty
            return

        try:
            for df in reader:
                self._check_dtypes(df)
                yield df
        finally:
            # Release the file handle even if the consumer stops early.
            reader.close()

    def read_csv(self, chunksize: Optional[int] = None) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
        """
        Read CSV.

        @param chunksize: Number of rows to read at a time (optional, applies to large datasets).
        @return: pandas DataFrame, or a generator of DataFrames when chunksize is given.
        @raise AssertionError: if a column of the file has no entry in dtypes.
        """
        # Chunked reading lives in a separate generator: a `yield` in this
        # function would turn the non-chunked call into a generator as well
        # and the caller would never receive the DataFrame.
        if chunksize is not None:
            return self._iter_chunks(chunksize)

        try:
            # header=0: skip the header line that get_columns already parsed.
            data: pd.DataFrame = pd.read_csv(
                self.filepath, header=0,
                sep=self.sep, names=self.columns, dtype=self.dtypes
            )
        except pd.errors.EmptyDataError:
            data = pd.DataFrame(columns=self.columns)

        self._check_dtypes(data)
        return data