"""I/O functions for files related to ECG processing."""
import re
import warnings
from pathlib import Path
from typing import Optional, Sequence
import pandas as pd
from biopsykit.utils._datatype_validation_helper import _assert_file_extension, _assert_is_dir
from biopsykit.utils._types import path_t
from biopsykit.utils.datatype_helper import HeartRatePhaseDict, HeartRateSubjectDataDict, is_hr_phase_dict
from biopsykit.utils.file_handling import get_subject_dirs, is_excel_file
from biopsykit.utils.time import tz
# Public API of this module: the names exported by a star-import (``from <module> import *``).
__all__ = [
"load_hr_phase_dict",
"load_hr_phase_dict_folder",
"load_hr_phase_dict_csv",
"write_hr_phase_dict",
"write_hr_phase_dict_csv",
]
def load_hr_phase_dict(file_path: path_t, assert_format: Optional[bool] = True) -> HeartRatePhaseDict:
    """Read the heart rate time series of a single subject from an Excel file.

    The result is a :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict`, i.e., a dictionary holding
    the heart rate data of one subject split into phases (the format exported by :func:`write_hr_phase_dict`
    or :func:`write_hr_phase_dict_csv`).

    Parameters
    ----------
    file_path : :class:`~pathlib.Path` or str
        path to the Excel file
    assert_format : bool, optional
        ``True`` to validate that the imported dictionary has the expected format, ``False`` to skip
        this check. Default: ``True``

    Returns
    -------
    :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict`
        dictionary with heart rate data split into phases

    Raises
    ------
    :exc:`~biopsykit.utils.exceptions.ValidationError`
        if the file in ``file_path`` is not a :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict`
        (only if ``assert_format`` is ``True``)
    :exc:`~biopsykit.utils.exceptions.FileExtensionError`
        if the file is not an Excel file (`.xls` or `.xlsx`)

    See Also
    --------
    ~biopsykit.utils.datatype_helper.HeartRatePhaseDict : Dictionary format
    write_hr_phase_dict : Write :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` to file

    """
    excel_path = Path(file_path)

    # validate the file before touching it: extension first, then the generic Excel-file check
    _assert_file_extension(excel_path, (".xls", ".xlsx"))
    is_excel_file(excel_path)

    # ``sheet_name=None`` makes pandas return a dict with one dataframe per sheet
    sheets: HeartRatePhaseDict = pd.read_excel(excel_path, index_col="time", sheet_name=None)
    if assert_format:
        is_hr_phase_dict(sheets)

    # Excel does not support timezone-aware dates, so every sheet is (re-)localized after loading
    localized = {}
    for phase, data in sheets.items():
        localized[phase] = data.tz_localize(tz)
    return localized
def load_hr_phase_dict_folder(
    base_path: path_t, filename_pattern: str, subfolder_pattern: Optional[str] = None
) -> HeartRateSubjectDataDict:
    r"""Load a folder with multiple ``HeartRatePhaseDict`` and concatenate them into a ``HeartRateSubjectDataDict``.

    This function looks for all files that match ``filename_pattern`` in the folder specified by ``base_path``
    and loads the files, which are all expected to be :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict`.

    Subject IDs are extracted from the file name. Hence, ``filename_pattern`` needs to be a regex
    including a capture group, e.g. ``r"ecg_results_(\w+)\.xlsx"``. If the pattern matches a file name more
    than once, the first match is used as subject ID.

    Alternatively, if the files are stored in subfolders, the name pattern of these subfolders can be specified by
    ``subfolder_pattern``. Then, it is expected that the subfolder names correspond to the subject IDs.
    Subfolders that contain no matching file are skipped.

    The returned dictionary will be a :obj:`~biopsykit.utils.datatype_helper.HeartRateSubjectDataDict`
    with the following format:

    { ``subject_id`` : :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` }

    Parameters
    ----------
    base_path : :class:`~pathlib.Path` or str
        path to top-level folder containing all subject folders
    filename_pattern : str
        filename pattern of exported :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict`.
        Must be a regex string with capture group to extract subject IDs, or a regular regex string
        (without capture group) if ``subfolder_pattern`` is specified
    subfolder_pattern : str, optional
        subfolder name pattern if files are stored in subfolders.
        Then, ``filename_pattern`` does **not** need to be a regex with a capture group because it is assumed that
        the names of the subfolders correspond to the subject IDs.

    Returns
    -------
    :obj:`~biopsykit.utils.datatype_helper.HeartRateSubjectDataDict`
        :obj:`~biopsykit.utils.datatype_helper.HeartRateSubjectDataDict`, i.e., a dictionary with
        :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` of multiple subjects

    Raises
    ------
    :exc:`~biopsykit.utils.exceptions.ValidationError`
        if any file that matches ``filename_pattern`` is not a
        :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict`
    :exc:`FileNotFoundError`
        if no files in ``base_path`` match ``filename_pattern`` (when ``subfolder_pattern`` is ``None``)
        or no subfolders match ``subfolder_pattern``

    See Also
    --------
    ~biopsykit.utils.file_handling.get_subject_dirs : Filter for subject subfolders in a given folder
    load_hr_phase_dict : Load :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` for one subject

    Examples
    --------
    >>> from biopsykit.io.ecg import load_hr_phase_dict_folder
    >>> base_path = "./ecg_results/"
    >>> # Option 1: all files are stored in `base_path`, subject IDs are extracted from the file names
    >>> dict_hr_subjects = load_hr_phase_dict_folder(
    ...     base_path,
    ...     filename_pattern=r"ecg_result_(\w+)\.xlsx")
    >>> print(dict_hr_subjects)
    {
         'Vp01': {}, # one single HeartRatePhaseDict
         'Vp02': {},
         # ...
    }
    >>> # Option 2: files are stored in subfolders, the name of the subfolders is the corresponding subject ID
    >>> dict_hr_subjects = load_hr_phase_dict_folder(
    ...     base_path,
    ...     filename_pattern=r"ecg_result*.xlsx",
    ...     subfolder_pattern="Vp*")
    >>> print(dict_hr_subjects)
    {
         'Vp01': {}, # one single HeartRatePhaseDict
         'Vp02': {},
         # ...
    }

    """
    # ensure pathlib
    base_path = Path(base_path)
    dict_hr_subjects = {}

    if subfolder_pattern is None:
        # compile once and match every file name a single time: a non-empty ``findall`` result means the
        # file matches, and its first entry is the subject ID
        filename_regex = re.compile(filename_pattern)
        matched_files = []
        for file in sorted(base_path.glob("*")):
            found = filename_regex.findall(file.name)
            if found:
                matched_files.append((found[0], file))

        if not matched_files:
            raise FileNotFoundError(f"No files matching the pattern '{filename_pattern}' found in {base_path}.")

        for subject_id, file in matched_files:
            dict_hr_subjects[subject_id] = load_hr_phase_dict(file)
        return dict_hr_subjects

    # files are stored in one subfolder per subject: the subfolder name is the subject ID
    for subject_dir in get_subject_dirs(base_path, subfolder_pattern):
        hr_phase_dict = _load_hr_phase_dict_single_subject(subject_dir, filename_pattern)
        if hr_phase_dict is None:
            # no heart rate data found for this subject
            continue
        dict_hr_subjects[subject_dir.name] = hr_phase_dict
    return dict_hr_subjects
def _load_hr_phase_dict_single_subject(subject_dir: Path, filename_pattern: str) -> Optional[HeartRatePhaseDict]:
    """Load the heart rate data stored in the folder of one subject.

    Files are first searched by using ``filename_pattern`` as glob pattern. If that yields nothing,
    all entries of ``subject_dir`` are filtered by using ``filename_pattern`` as regex.

    Parameters
    ----------
    subject_dir : :class:`~pathlib.Path`
        folder of the subject; its name is used as subject ID in the "no data" message
    filename_pattern : str
        glob or regex pattern of the files to load

    Returns
    -------
    :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` or ``None``
        the loaded dictionary (merged from all matching files if there is more than one), or ``None``
        if no file matches

    """
    # 1st attempt: treat the pattern as glob expression (pattern without capture group)
    candidates = sorted(subject_dir.glob(filename_pattern))
    if not candidates:
        # 2nd attempt: treat the pattern as regex (pattern with capture group, which is actually not
        # necessary when subfolders are used) and keep the entries whose name matches
        candidates = [entry for entry in sorted(subject_dir.glob("*")) if re.search(filename_pattern, entry.name)]

    if not candidates:
        print(f"No Heart Rate data for subject {subject_dir.name}")
        return None

    if len(candidates) == 1:
        return load_hr_phase_dict(candidates[0])

    warnings.warn(
        'More than one file matching file pattern "{}" found in folder {}. '
        "Trying to merge these files into one HeartRatePhaseDict".format(filename_pattern, subject_dir)
    )
    # later files overwrite phases with the same name from earlier files
    merged = {}
    for path in candidates:
        merged.update(load_hr_phase_dict(path))
    return merged
def write_hr_phase_dict(hr_phase_dict: HeartRatePhaseDict, file_path: path_t):
    """Export a :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` to an Excel file.

    A :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` is a dictionary with heart rate time
    series data split into phases. Every phase of the dictionary ends up as a separate sheet of the
    Excel file.

    Both arguments are validated (``file_path`` via ``is_excel_file``, ``hr_phase_dict`` via
    ``is_hr_phase_dict``) before anything is written.

    Parameters
    ----------
    hr_phase_dict : :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict`
        a :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` containing pandas dataframes
        with heart rate data
    file_path : :class:`~pathlib.Path` or str
        path to export file

    See Also
    --------
    ~biopsykit.utils.datatype_helper.HeartRatePhaseDict : Dictionary format
    load_hr_phase_dict : Load :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` written to file
    ~biopsykit.io.write_pandas_dict_excel : Write dictionary with pandas dataframes to Excel file

    """
    # imported at call time instead of module level
    # NOTE(review): presumably to avoid a circular import with ``biopsykit.io`` - confirm
    from biopsykit.io import write_pandas_dict_excel  # pylint:disable=import-outside-toplevel

    # the target must be an Excel file ...
    is_excel_file(file_path)
    # ... and the data must be a valid HeartRatePhaseDict
    is_hr_phase_dict(hr_phase_dict)

    write_pandas_dict_excel(hr_phase_dict, file_path)
def write_hr_phase_dict_csv(hr_phase_dict: HeartRatePhaseDict, folder_path: path_t, file_pattern: path_t):
    """Export a :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` as a series of csv files.

    A :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` is a dictionary with heart rate time
    series data split into phases. Every phase of the dictionary is written to its own csv file.

    Parameters
    ----------
    hr_phase_dict : :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict`
        a :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` containing pandas dataframes
        with heart rate data
    folder_path : :class:`~pathlib.Path` or str
        folder path to export csv files
    file_pattern : :class:`~pathlib.Path` or str
        file pattern to save. The file pattern must include a placeholder "{}" which will be filled with the
        name of the phase.

    Raises
    ------
    ValueError
        if ``file_pattern`` does not include a placeholder "{}" that can be filled with the phase name
    :exc:`~biopsykit.utils.exceptions.ValidationError`
        if ``hr_phase_dict`` is not a :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict`
    :exc:`~biopsykit.utils.exceptions.FileExtensionError`
        if ``file_pattern`` is no csv file

    See Also
    --------
    ~biopsykit.utils.datatype_helper.HeartRatePhaseDict : Dictionary format
    load_hr_phase_dict_csv : Load :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` written to csv files

    """
    out_dir = Path(folder_path)
    _assert_is_dir(out_dir)

    # the pattern is filled via ``str.format``, so it has to be a string containing a "{}" placeholder
    pattern = str(file_pattern)
    has_placeholder = "{}" in pattern
    if not has_placeholder:
        raise ValueError(
            f"'file_pattern' must include a placeholder to be filled with the phase name!. Got {pattern}."
        )

    is_hr_phase_dict(hr_phase_dict)

    for phase_name, phase_data in hr_phase_dict.items():
        target = out_dir.joinpath(Path(pattern.format(phase_name)))
        # the extension is checked per file, directly before the file is written
        _assert_file_extension(target, ".csv")
        phase_data.to_csv(target)
def load_hr_phase_dict_csv(
    folder_path: path_t,
    file_pattern: path_t,
    phase_order: Optional[Sequence[str]] = None,
    assert_format: Optional[bool] = True,
) -> HeartRatePhaseDict:
    r"""Load csv files with time series HR data of one subject from folder and combine them into a ``HeartRatePhaseDict``.

    The returned dictionary will be a :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict`,
    i.e., a dict with heart rate data from one subject split into phases (as exported by
    :func:`write_hr_phase_dict_csv`).

    All ``*.csv`` files in ``folder_path`` are loaded; each file is expected to contain one phase and to have
    a ``time`` column which is used as index. The index is converted to datetime if possible and left
    unchanged otherwise.

    Parameters
    ----------
    folder_path : :class:`~pathlib.Path` or str
        path to the folder containing the csv files
    file_pattern : :class:`~pathlib.Path` or str
        file pattern of the csv files. ``file_pattern`` must include a regex capture group (see Examples) which
        is used to extract the phase name from the file name.
    phase_order : list of str, optional
        list of phase names to order resulting :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` or ``None``
        to order phases according to the file name ordering in ``folder_path``.
        Default: ``None``
    assert_format : bool, optional
        whether to check if the imported dict is in the right format or not

    Returns
    -------
    :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict`
        Dict with heart rate data split into phases

    Raises
    ------
    ValueError
        if ``folder_path`` contains no csv files, if the name of a csv file does not match ``file_pattern``
        or matches it more than once, or if ``phase_order`` does not contain exactly the phases found
        in ``folder_path``
    :exc:`~biopsykit.utils.exceptions.ValidationError`
        if the loaded data is not a :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict`
        (if ``assert_format`` is ``True``)

    See Also
    --------
    ~biopsykit.utils.datatype_helper.HeartRatePhaseDict : Dictionary format
    write_hr_phase_dict_csv : Write :obj:`~biopsykit.utils.datatype_helper.HeartRatePhaseDict` to a series of csv files

    Examples
    --------
    >>> from biopsykit.io.ecg import load_hr_phase_dict_csv
    >>> # files are named "hr_result_<phase>.csv", the capture group extracts the phase name
    >>> dict_hr = load_hr_phase_dict_csv("./ecg_results/Vp01", file_pattern=r"hr_result_(\w+)\.csv")

    """
    # ensure pathlib
    folder_path = Path(folder_path)
    _assert_is_dir(folder_path)

    file_list = sorted(folder_path.glob("*.csv"))
    if len(file_list) == 0:
        raise ValueError(f"No csv files found in {folder_path}!")

    # ``file_pattern`` may be a Path (see type hint), but ``re`` only accepts string patterns
    file_pattern = str(file_pattern)
    phase_regex = re.compile(file_pattern)

    dict_hr = {}
    for file in file_list:
        phase_names = phase_regex.findall(file.name)
        if len(phase_names) == 0:
            raise ValueError(f"No string matching the pattern {file_pattern} found in {file}!")
        if len(phase_names) > 1:
            raise ValueError(f"Too many strings matching the pattern {file_pattern} found in {file}!")

        df = pd.read_csv(file, index_col="time")
        try:
            df.index = pd.to_datetime(df.index)
        except (ValueError, TypeError):
            # deliberate best-effort: an index that cannot be parsed as datetime is kept as it is
            # (explicit replacement for the deprecated ``errors="ignore"`` option)
            pass
        dict_hr[phase_names[0]] = df

    if assert_format:
        # assert that the dictionary is in the correct format
        is_hr_phase_dict(dict_hr)

    if phase_order is not None:
        # ``phase_order`` must name exactly the loaded phases before it can be used for re-ordering
        _assert_phase_names(folder_path, phase_order, dict_hr.keys())
        dict_hr = {phase: dict_hr[phase] for phase in phase_order}
    return dict_hr
def _assert_phase_names(folder_path, phase_order, dict_keys):
if len(phase_order) != len(dict_keys):
raise ValueError(
f"Number of phases provided by 'phase_order' do not match. "
f"Expected {len(dict_keys)}, got {len(phase_order)}."
)
if set(phase_order) != set(dict_keys):
raise ValueError(
f"Phases provided by 'phase_order' do not match the phases of the files in {folder_path}. "
f"Expected {phase_order}, got {list(dict_keys)}."
)