Source code for biopsykit.io.ecg

"""I/O functions for files related to ECG processing."""
import re
import warnings
from pathlib import Path
from typing import Optional, Sequence

import pandas as pd
from biopsykit.utils._datatype_validation_helper import _assert_file_extension, _assert_is_dir
from biopsykit.utils._types import path_t
from biopsykit.utils.datatype_helper import HeartRatePhaseDict, HeartRateSubjectDataDict, is_hr_phase_dict
from biopsykit.utils.file_handling import get_subject_dirs, is_excel_file
from biopsykit.utils.time import tz

__all__ = [
    "load_hr_phase_dict",
    "load_hr_phase_dict_folder",
    "load_hr_phase_dict_csv",
    "write_hr_phase_dict",
    "write_hr_phase_dict_csv",
]


[docs]def load_hr_phase_dict(file_path: path_t, assert_format: Optional[bool] = True) -> HeartRatePhaseDict:
    """Load Excel file containing time series heart rate data of one subject.

    The returned dictionary will be a :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict`,
    i.e., a dict with heart rate data from one subject split into phases (as exported by :func:`write_hr_phase_dict`
    or :func:`write_hr_phase_dict_csv`).

    Parameters
    ----------
    file_path : :class:`~pathlib.Path` or str
        path to file
    assert_format : bool, optional
        whether to check if the imported dict is in the right format or not

    Returns
    -------
    :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict`
        Dict with heart rate data split into phases

    Raises
    ------
    :exc:`~biopsykit.utils.exceptions.ValidationError`
        if file in ``file_path`` is not a :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict`
        (if ``assert_format`` is ``True``)
    :exc:`~biopsykit.utils.exceptions.FileExtensionError`
        if file is no Excel file (`.xls` or `.xlsx`)

    See Also
    --------
    ~biopsykit.utils.datatype_helper.HeartRatePhaseDict : Dictionary format
    write_hr_phase_dict : Write :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` to file

    """
    # ensure pathlib
    file_path = Path(file_path)
    _assert_file_extension(file_path, (".xls", ".xlsx"))

    # assure that the file is an Excel file
    is_excel_file(file_path)

    dict_hr: HeartRatePhaseDict = pd.read_excel(file_path, index_col="time", sheet_name=None)
    if assert_format:
        # assert that the dictionary is in the correct format
        is_hr_phase_dict(dict_hr)

    # (re-)localize each sheet since Excel does not support timezone-aware dates
    dict_hr = {k: v.tz_localize(tz) for k, v in dict_hr.items()}
    return dict_hr


[docs]def load_hr_phase_dict_folder(
    base_path: path_t, filename_pattern: str, subfolder_pattern: Optional[str] = None
) -> HeartRateSubjectDataDict:
    r"""Load a folder with multiple ``HeartRatePhaseDict`` and concatenate them  into a ``HeartRateSubjectDataDict``.

    This functions looks for all files that match the ``file_pattern`` in the folder specified by ``base_path``
    and loads the files that are all expected to be :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict`.

    Subject IDs are extracted from the file name. Hence, ``file_pattern`` needs to be a regex
    including a capture group, e.g. "ecg_results_(\\w+).xlsx".

    Alternatively, if the files are stored in subfolders, the name pattern of these subfolders can be specified by
    ``subject_folder_pattern``. Then, it is expected that the subfolder names correspond to the subject IDs.

    The returned dictionary will be a :obj:`~biopsykit.utils.datatype_helper.HeartRateSubjectDataDict`
    with the following format:

    { ``subject_id`` : :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` }


    Parameters
    ----------
    base_path : :class: `~pathlib.Path` or str
        path to top-level folder containing all subject folders
    filename_pattern : str
        filename pattern of exported :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict`.
        Must be a regex string with capture group to extract subject IDs, or a regular regex string
        (without capture group) if ``subfolder_pattern`` is specified
    subfolder_pattern : str, optional
        subfolder name pattern if files are stored in subfolders.
        Then, ``filename_pattern`` does **not** need to be a regex with a capture group because it is assumed that
        the names of the subfolders correspond to the subject IDs.

    Returns
    -------
    :obj:`~biopsykit.utils.datatype_helper.HeartRateSubjectDataDict`
        :obj:`~biopsykit.utils.datatype_helper.HeartRateSubjectDataDict`, i.e., a dictionary with
        :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` of multiple subjects

    Raises
    ------
    :exc:`~biopsykit.utils.exceptions.ValidationError`
        if any file that matches ``filename_pattern`` is not a
        :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict`
    :exc:`FileNotFoundError`
        if no files match ``filename_pattern`` or no subfolders match ``subfolder_pattern``

    See Also
    --------
    ~biopsykit.utils.file_handling.get_subject_dirs : Filter for subject subfolders in a given folder
    load_hr_phase_dict : Load :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` for one subject

    Examples
    --------
    >>> from biopsykit.io.ecg import load_hr_phase_dict_folder
    >>> base_path = "./ecg_results/"
    >>> # Option 1: all files are stored in `base_path`, subject IDs are extracted from the file names
    >>> dict_hr_subjects = load_hr_phase_dict_folder(
    >>>                         base_path,
    >>>                         filename_pattern=r"ecg_result_(\\w+).xlsx")
    >>> print(dict_hr_subjects)
    {
         'Vp01': {}, # one single HeartRatePhaseDict
         'Vp02': {},
         # ...
    }
    >>> # Option 2: files are stored in subfolders, the name of the subfolders is the corresponding subject ID
    >>> dict_hr_subjects = load_hr_phase_dict_folder(
    >>>                         base_path,
    >>>                         filename_pattern=r"ecg_result*.xlsx",
    >>>                         subfolder_pattern="Vp*")
    >>> print(dict_hr_subjects)
    {
         'Vp01': {}, # one single HeartRatePhaseDict
         'Vp02': {},
         # ...
    }

    """
    # ensure pathlib
    base_path = Path(base_path)

    dict_hr_subjects = {}
    if subfolder_pattern is None:
        file_list = sorted(base_path.glob("*"))
        file_list = [f for f in file_list if re.search(filename_pattern, f.name)]
        if len(file_list) == 0:
            raise FileNotFoundError(f"No files matching the pattern '{filename_pattern}' found in {base_path}.")
        for file in file_list:
            subject_id = re.findall(filename_pattern, file.name)[0]
            dict_hr_subjects[subject_id] = load_hr_phase_dict(file)
    else:
        subject_dirs = get_subject_dirs(base_path, subfolder_pattern)

        for subject_dir in subject_dirs:
            subject_id = subject_dir.name
            hr_phase_dict = _load_hr_phase_dict_single_subject(subject_dir, filename_pattern)
            if hr_phase_dict is None:
                continue
            dict_hr_subjects[subject_id] = hr_phase_dict
    return dict_hr_subjects


def _load_hr_phase_dict_single_subject(subject_dir: Path, filename_pattern: str) -> Optional[HeartRatePhaseDict]:
    subject_id = subject_dir.name
    # first try to search for files with glob (assuming that a regex string without capture group was passed),
    # then try to search via regex search (assuming that a regex string with capture group was passed,
    # which should actually not be done if subfolder_pattern is passed)
    file_list = sorted(subject_dir.glob(filename_pattern))
    if len(file_list) == 0:
        file_list = sorted(subject_dir.glob("*"))
        # then extract the ones that match
        file_list = [f for f in file_list if re.search(filename_pattern, f.name)]

    if len(file_list) == 1:
        return load_hr_phase_dict(file_list[0])
    if len(file_list) > 1:
        warnings.warn(
            'More than one file matching file pattern "{}" found in folder {}. '
            "Trying to merge these files into one HeartRatePhaseDict".format(filename_pattern, subject_dir)
        )
        dict_hr = {}
        for file in file_list:
            dict_hr.update(load_hr_phase_dict(file))
        return dict_hr

    print(f"No Heart Rate data for subject {subject_id}")
    return None


[docs]def write_hr_phase_dict(hr_phase_dict: HeartRatePhaseDict, file_path: path_t):
    """Write :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` to an Excel file.

    The :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` is a dictionary with heart rate time
    series data split into phases.

    Each of the phases in the dictionary will be a separate sheet in the Excel file.

    Parameters
    ----------
    hr_phase_dict : :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict`
        a :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` containing pandas dataframes
        with heart rate data
    file_path : :class:`~pathlib.Path` or str
        path to export file

    See Also
    --------
    ~biopsykit.utils.datatype_helper.HeartRatePhaseDict : Dictionary format
    load_hr_phase_dict : Load :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` written to file
    ~biopsykit.io.write_pandas_dict_excel : Write dictionary with pandas dataframes to Excel file

    """
    from biopsykit.io import write_pandas_dict_excel  # pylint:disable=import-outside-toplevel

    # assert that file path is an Excel file
    is_excel_file(file_path)

    # assert that dict is in the right format
    is_hr_phase_dict(hr_phase_dict)
    write_pandas_dict_excel(hr_phase_dict, file_path)


[docs]def write_hr_phase_dict_csv(hr_phase_dict: HeartRatePhaseDict, folder_path: path_t, file_pattern: path_t):
    """Write :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` to a series of csv files.

    The :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` is a dictionary with heart rate time
    series data split into phases.

    Each of the phases in the dictionary will be a separate csv file.

    Parameters
    ----------
    hr_phase_dict : :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict`
        a :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` containing pandas dataframes
        with heart rate data
    folder_path : :class:`~pathlib.Path` or str
        folder path to export csv files
    file_pattern : :class:`~pathlib.Path` or str
        file pattern to save. The file pattern must include a placeholder "{}" which will be filled with the
        name of the phase.

    Raises
    ------
    ValueError
        if ``file_pattern`` does not include a placeholder "{}" that can be filled with the phase name
    :exc:`~biopsykit.utils.exceptions.ValidationError`
        if ``hr_phase_dict`` is not a :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict`
    :exc:`~biopsykit.utils.exceptions.FileExtensionError`
        if ``file_pattern`` is no csv file

    See Also
    --------
    ~biopsykit.utils.datatype_helper.HeartRatePhaseDict : Dictionary format
    load_hr_phase_dict_csv : Load :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` written to csv files

    """
    # ensure pathlib
    folder_path = Path(folder_path)
    _assert_is_dir(folder_path)

    # check file pattern for sanity
    file_pattern = str(file_pattern)
    if "{}" not in file_pattern:
        raise ValueError(
            f"'file_pattern' must include a placeholder to be filled with the phase name!. Got {file_pattern}."
        )
    is_hr_phase_dict(hr_phase_dict)

    for phase, data in hr_phase_dict.items():
        file_name = Path(file_pattern.format(phase))
        file_path = folder_path.joinpath(file_name)
        _assert_file_extension(file_path, ".csv")
        data.to_csv(file_path)


[docs]def load_hr_phase_dict_csv(
    folder_path: path_t,
    file_pattern: path_t,
    phase_order: Optional[Sequence[str]] = None,
    assert_format: Optional[bool] = True,
) -> HeartRatePhaseDict:
    """Load csv file with time series HR data of one subject from folder and combine it into a ``HeartRatePhaseDict``.

    The returned dictionary will be a :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict`,
    i.e., a dict with heart rate data from one subject split into phases (as exported by :func:`write_hr_phase_dict`).

    Parameters
    ----------
    folder_path : :class:`~pathlib.Path` or str
        folder path to export csv files
    file_pattern : :class:`~pathlib.Path` or str
        file pattern of the csv files. `file_pattern` must include a regex capture group (see Examples) which is used
        to extract the phase name from the file name.
    phase_order : list of str, optional
        list of phase names to order resulting :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` or ``None``
        to order phases according to the file name ordering in ``folder_path``.
        Default: ``None``
    assert_format : bool, optional
        whether to check if the imported dict is in the right format or not

    Returns
    -------
    :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict`
        Dict with heart rate data split into phases

    Raises
    ------
    :exc:`~biopsykit.utils.exceptions.ValidationError`
        if file in ``file_path`` is not a :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict`
        (if ``assert_format`` is ``True``)
    :exc:`~biopsykit.utils.exceptions.FileExtensionError`
        if file is no Excel file (`.xls` or `.xlsx`)

    See Also
    --------
    ~biopsykit.utils.datatype_helper.HeartRatePhaseDict : Dictionary format
    write_hr_phase_dict_csv : Write :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` to a series of csv files

    """
    # ensure pathlib
    folder_path = Path(folder_path)
    _assert_is_dir(folder_path)

    file_list = sorted(folder_path.glob("*.csv"))
    if len(file_list) == 0:
        raise ValueError(f"No csv files found in {folder_path}!")

    dict_hr = {}
    for file in file_list:
        phase_name = re.findall(file_pattern, file.name)
        if len(phase_name) == 0:
            raise ValueError(f"No string matching the pattern {file_pattern} found in {file}!")
        if len(phase_name) > 1:
            raise ValueError(f"Too many strings matching the pattern {file_pattern} found in {file}!")
        phase_name = phase_name[0]
        df = pd.read_csv(file, index_col="time")
        df.index = pd.to_datetime(df.index, errors="ignore")
        dict_hr[phase_name] = df

    if assert_format:
        # assert that the dictionary is in the correct format
        is_hr_phase_dict(dict_hr)

    if phase_order is not None:
        _assert_phase_names(folder_path, phase_order, dict_hr.keys())
        dict_hr = {phase: dict_hr[phase] for phase in phase_order}

    return dict_hr


def _assert_phase_names(folder_path, phase_order, dict_keys):
    if len(phase_order) != len(dict_keys):
        raise ValueError(
            f"Number of phases provided by 'phase_order' do not match. "
            f"Expected {len(dict_keys)}, got {len(phase_order)}."
        )
    if set(phase_order) != set(dict_keys):
        raise ValueError(
            f"Phases provided by 'phase_order' do not match the phases of the files in {folder_path}. "
            f"Expected {phase_order}, got {list(dict_keys)}."
        )