Source code for otoole.preprocess.longify_data

"""Read in a folder of irregular wide-format csv files and write them out as narrow csvs
"""
import logging
from typing import Dict

import numpy as np
import pandas as pd

logger = logging.getLogger()


[docs]def check_set_datatype( narrow: pd.DataFrame, config_details: Dict, set_name: str ) -> pd.DataFrame: """Checks the datatypes of a set_name dataframe Arguments --------- narrow : pandas.DataFrame The set data config_details : dict The configuration dictionary set_name : str The name of the set """ datatype = config_details[set_name]["dtype"] logger.debug("Columns for set %s are: %s", set_name, narrow.columns) if narrow.iloc[:, 0].dtype != datatype: logger.info("dtype does not match %s for set %s", datatype, set_name) return narrow
[docs]def check_datatypes( df: pd.DataFrame, config_details: Dict, parameter: str ) -> pd.DataFrame: """Checks a parameters datatypes Arguments --------- df : pandas.DataFrame The parameter data config_details : dict The configuration dictionary parameter : str The name of the parameter """ logger.info("Checking datatypes for %s", parameter) logger.debug(df.columns) dtypes = {} for column in df.columns: if column == "VALUE": datatype = config_details[parameter]["dtype"] dtypes["VALUE"] = datatype else: datatype = config_details[column]["dtype"] dtypes[column] = datatype logger.debug(f"Found {datatype} for column {column}") if df[column].dtype != datatype: logger.info( "dtype of column %s does not match %s for parameter %s", column, datatype, parameter, ) if datatype == "int": dtypes[column] = "int64" try: df[column] = df[column].apply(_cast_to_int) except ValueError as ex: msg = "Unable to apply datatype for column {}: {}".format( column, str(ex) ) raise ValueError(msg) return df.astype(dtypes)
def _cast_to_int(value): return np.int64(float(value))