"""Module containing utility functions for manipulating and processing questionnaire data."""
import re
import warnings
from inspect import getmembers, isfunction
from typing import Any, Dict, Literal, Optional, Sequence, Tuple, Union
import numpy as np
import pandas as pd
from biopsykit.utils._datatype_validation_helper import _assert_is_dtype, _assert_len_list, _assert_value_range
from biopsykit.utils.dataframe_handling import wide_to_long as wide_to_long_utils
__all__ = [
"bin_scale",
"compute_scores",
"crop_scale",
"convert_scale",
"find_cols",
"zero_pad_columns",
"invert",
"to_idx",
"wide_to_long",
"get_supported_questionnaires",
]
def find_cols(
    data: pd.DataFrame,
    regex_str: Optional[str] = None,
    starts_with: Optional[str] = None,
    ends_with: Optional[str] = None,
    contains: Optional[str] = None,
    zero_pad_numbers: Optional[bool] = True,
) -> Tuple[pd.DataFrame, Sequence[str]]:
    r"""Find columns in dataframe that match a specific pattern.

    This function is useful to find all columns that belong to a questionnaire. Column names can be filtered based on
    one (or a combination of) the following criteria:

    * ``starts_with``: columns have to start with the specified string
    * ``ends_with``: columns have to end with the specified string
    * ``contains``: columns have to contain the specified string

    Optionally, the item numbers in the matching column names can be zero-padded, if they are not already.

    .. note::
        If ``zero_pad_numbers`` is ``True`` then the column names returned by this function will be renamed and might
        thus not match the column names of the original dataframe. To solve this, make sure your original dataframe
        already has zero-padded columns (by manually renaming them) or convert column names using
        :func:`~biopsykit.questionnaires.utils.zero_pad_columns`.

    .. warning::
        Zero-padding using :func:`~biopsykit.questionnaires.utils.zero_pad_columns` assumes, by default, that numbers
        are *at the end* of column names. If you want to change that behavior
        (e.g., because the column names have string suffixes), you might need to apply zero-padding manually.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        dataframe with columns to be filtered
    regex_str : str, optional
        regex string to extract column names. If this parameter is passed the other parameters (``starts_with``,
        ``ends_with``, ``contains``) will be ignored. Default: ``None``
    starts_with : str, optional
        string columns have to start with. Default: ``None``
    ends_with : str, optional
        string columns have to end with. Default: ``None``
    contains : str, optional
        string columns have to contain. Default: ``None``
    zero_pad_numbers : bool, optional
        whether to zero-pad numbers in column names. Default: ``True``

    Returns
    -------
    data_filt : :class:`~pandas.DataFrame`
        dataframe with filtered columns that match the specified pattern
    cols : :class:`~pandas.Index`
        columns that match the specified pattern

    Examples
    --------
    >>> import biopsykit as bp
    >>> import pandas as pd
    >>> # Option 1: has to start with "XX"
    >>> data = pd.DataFrame(columns=["XX_{}".format(i) for i in range(1, 11)])
    >>> df, cols = bp.questionnaires.utils.find_cols(data, starts_with="XX")
    >>> print(cols)
    >>> ["XX_01", "XX_02", ..., "XX_10"]
    >>> # Option 2: has to end with "Post"
    >>> data = pd.DataFrame(columns=["XX_1_Pre", "XX_2_Pre", "XX_3_Pre", "XX_1_Post", "XX_2_Post", "XX_3_Post"])
    >>> df, cols = bp.questionnaires.utils.find_cols(data, ends_with="Post")
    >>> print(cols)
    >>> ["XX_01_Post", "XX_02_Post", "XX_03_Post"]
    >>> # Option 3: has to start with "XX" and end with "Post"
    >>> data = pd.DataFrame(columns=["XX_1_Pre", "XX_2_Pre", "XX_3_Pre", "XX_1_Post", "XX_2_Post", "XX_3_Post",
    "YY_1_Pre", "YY_2_Pre", "YY_1_Post", "YY_2_Post"])
    >>> df, cols = bp.questionnaires.utils.find_cols(data, starts_with="XX", ends_with="Post")
    >>> print(cols)
    >>> # WARNING: this will not zero-pad the questionnaire numbers!
    >>> ["XX_1_Post", "XX_2_Post", "XX_3_Post"]
    >>> # Option 4: pass custom regex string
    >>> data = pd.DataFrame(columns=["XX_1_Pre", "XX_2_Pre", "XX_3_Pre", "XX_1_Post", "XX_2_Post", "XX_3_Post",
    "YY_1_Pre", "YY_2_Pre", "YY_1_Post", "YY_2_Post"])
    >>> df, cols = bp.questionnaires.utils.find_cols(data, regex_str=r"XX_\d+_\w+")
    >>> print(cols)
    >>> # here, zero-padding will be possible again
    >>> ["XX_01_Post", "XX_02_Post", "XX_03_Post"]
    >>> # Option 5: disable zero-padding
    >>> data = pd.DataFrame(columns=["XX_{}".format(i) for i in range(1, 11)])
    >>> df, cols = bp.questionnaires.utils.find_cols(data, starts_with="XX", zero_pad_numbers=False)
    >>> print(cols)
    >>> ["XX_1", "XX_2", ..., "XX_10"]

    """
    _assert_is_dtype(data, pd.DataFrame)
    data_filt = data.copy()

    if regex_str:
        # a custom regex takes precedence over all other filter criteria
        data_filt = data_filt.filter(regex=regex_str)
    else:
        # the three criteria are applied successively, i.e., they are combined with logical AND
        if starts_with:
            data_filt = data_filt.filter(regex="^" + starts_with)
        if ends_with:
            data_filt = data_filt.filter(regex=ends_with + "$")
        if contains:
            data_filt = data_filt.filter(regex=contains)

    if zero_pad_numbers:
        data_filt = zero_pad_columns(data_filt)
    cols = data_filt.columns

    return data_filt, cols
def zero_pad_columns(data: pd.DataFrame, inplace: Optional[bool] = False) -> Optional[pd.DataFrame]:
    r"""Add zero-padding to numbers at the **end** of column names in a dataframe.

    .. warning::
        By default, this function assumes that numbers are **at the end** of column names. If you need to change that
        behavior (e.g., because the column names have string suffixes), you might need to apply zero-padding manually.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        dataframe with columns to zero-pad
    inplace : bool, optional
        whether to perform the operation inplace or not. Default: ``False``

    Returns
    -------
    :class:`~pandas.DataFrame` or ``None``
        dataframe with zero-padded columns or ``None`` if ``inplace`` is ``True``

    """
    _assert_is_dtype(data, pd.DataFrame)
    if not inplace:
        data = data.copy()

    # extract the trailing number (if any) of each column name
    nums = [re.findall(r"(\d+)$", c) for c in data.columns]
    nums = [c[0] if len(c) > 0 else "" for c in nums]
    if len(nums) == 0:
        # dataframe has no columns at all -> nothing to pad; honor the ``inplace`` contract
        # instead of discarding ``data`` (and its index) by returning a brand-new dataframe
        return None if inplace else data

    # pad to the longest number found, but at least to a width of 2 (e.g., "1" -> "01")
    zfill_num = max(max(list(map(len, nums))), 2)
    data.columns = [re.sub(r"(\d+)$", lambda m: m.group(1).zfill(zfill_num), c) for c in data.columns]
    if inplace:
        return None
    return data
def to_idx(col_idxs: Union[np.array, Sequence[int]]) -> np.ndarray:
    """Convert 1-based questionnaire item indices into 0-based array indices.

    Questionnaire items are conventionally numbered starting at 1. To avoid confusion in the
    implementation of questionnaires (array indices start at 0), BioPsyKit also numbers
    questionnaire items starting at 1 and uses this function to shift them to 0-based indexing.

    Parameters
    ----------
    col_idxs : list of int
        list of indices to convert to 0-based indexing

    Returns
    -------
    :class:`~numpy.ndarray`
        array with converted indices

    """
    return np.asarray(col_idxs) - 1
def invert(
    data: Union[pd.DataFrame, pd.Series],
    score_range: Sequence[int],
    cols: Optional[Union[np.array, Sequence[int], Sequence[str]]] = None,
    inplace: Optional[bool] = False,
) -> Optional[Union[pd.DataFrame, pd.Series]]:
    """Invert questionnaire scores.

    Many questionnaires contain items that need to be inverted (reversed) before sum scores can
    be computed. This function inverts either a single column (Series), selected columns of a
    dataframe (via the ``cols`` parameter), or a complete dataframe. An inverted value ``v``
    becomes ``score_range[1] - v + score_range[0]``.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame` or :class:`~pandas.Series`
        questionnaire data to invert
    score_range : list of int
        possible score range of the questionnaire items
    cols : list of str or list of int
        list of column names or column indices
    inplace : bool, optional
        whether to perform the operation inplace or not. Default: ``False``

    Returns
    -------
    :class:`~pandas.DataFrame` or ``None``
        dataframe with inverted columns or ``None`` if ``inplace`` is ``True``

    Raises
    ------
    :exc:`~biopsykit.utils.exceptions.ValidationError`
        if ``data`` is no dataframe or series
        if ``score_range`` does not have length 2
    :exc:`~biopsykit.utils.exceptions.ValueRangeError`
        if values in ``data`` are not in ``score_range``

    Examples
    --------
    >>> from biopsykit.questionnaires.utils import invert
    >>> data_in = pd.DataFrame({"A": [1, 2, 3, 1], "B": [4, 0, 1, 3], "C": [0, 3, 2, 3], "D": [0, 1, 2, 4]})
    >>> data_out = invert(data_in, score_range=[0, 4])
    >>> data_out["A"]
    >>> [3, 2, 1, 3]
    >>> # Invert only specific columns
    >>> data_out = invert(data_in, score_range=[0, 4], cols=["A", "C"])
    >>> data_out["B"]  # left unchanged
    >>> [4, 0, 1, 3]

    """
    _assert_is_dtype(data, (pd.DataFrame, pd.Series))
    _assert_len_list(score_range, 2)
    if not inplace:
        data = data.copy()

    if isinstance(data, pd.Series):
        # single column: invert all values (range check first)
        _assert_value_range(data, score_range)
        data.iloc[:] = score_range[1] - data.iloc[:] + score_range[0]
    else:
        data = _invert_dataframe(data, cols, score_range)

    return None if inplace else data
def _invert_dataframe(
    data: pd.DataFrame, cols: Union[Sequence[str], Sequence[int]], score_range: Sequence[int]
) -> pd.DataFrame:
    # Invert all (or the selected) columns of ``data`` within ``score_range`` (helper function).
    lo, hi = score_range[0], score_range[1]
    if cols is None:
        # no column selection -> invert the whole dataframe
        _assert_value_range(data, score_range)
        data.iloc[:, :] = hi - data.iloc[:, :] + lo
    elif isinstance(cols[0], str):
        # column names -> label-based selection
        _assert_value_range(data[cols], score_range)
        data.loc[:, cols] = hi - data.loc[:, cols] + lo
    else:
        # column indices -> positional selection
        _assert_value_range(data.iloc[:, cols], score_range)
        data.iloc[:, cols] = hi - data.iloc[:, cols] + lo
    return data
def _invert_subscales(
    data: pd.DataFrame,
    subscales: Dict[str, Sequence[Union[str, int]]],
    idx_dict: Dict[str, Sequence[int]],
    score_range: Sequence[int],
) -> pd.DataFrame:
    """Invert questionnaire scores from a dictionary of questionnaire subscales.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        questionnaire data to invert
    subscales : dict
        dictionary with subscale names (keys) and list of item indices or column names belonging to the
        individual subscales (values)
    idx_dict : dict
        dictionary with subscale names (keys) and indices of items that should be inverted (values)
    score_range : list of int
        possible score range of the questionnaire items

    Returns
    -------
    :class:`~pandas.DataFrame` or ``None``
        dataframe with inverted columns

    See Also
    --------
    invert : invert scores of questionnaire columns

    """
    _assert_is_dtype(data, pd.DataFrame)
    for scale_name, invert_idxs in idx_dict.items():
        # silently skip subscales that were not selected for computation
        if scale_name not in subscales:
            continue
        item_cols = np.array(subscales[scale_name])[invert_idxs]
        data = invert(data, cols=to_idx(item_cols), score_range=score_range)
    return data
def convert_scale(
    data: Union[pd.DataFrame, pd.Series],
    offset: int,
    cols: Optional[Union[Sequence[int], Sequence[str]]] = None,
    inplace: Optional[bool] = False,
) -> Optional[Union[pd.DataFrame, pd.Series]]:
    """Convert the score range of questionnaire items.

    Adds ``offset`` to all values (or only to the values of the columns selected by ``cols``),
    e.g., to shift a questionnaire from a [0, 4] to a [1, 5] range.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame` or :class:`~pandas.Series`
        questionnaire data to convert
    offset : int
        offset to add to questionnaire items
    cols : list of str or list of int, optional
        list of column names or column indices to convert, or ``None`` to convert all columns.
        Default: ``None``
    inplace : bool, optional
        whether to perform the operation inplace or not. Default: ``False``

    Returns
    -------
    :class:`~pandas.DataFrame`, :class:`~pandas.Series`, or ``None``
        dataframe with converted columns or ``None`` if ``inplace`` is ``True``

    Raises
    ------
    :exc:`~biopsykit.utils.exceptions.ValidationError`
        if ``data`` is no dataframe or series

    Examples
    --------
    >>> from biopsykit.questionnaires.utils import convert_scale
    >>> data_in = pd.DataFrame({"A": [1, 2, 3, 1], "B": [4, 0, 1, 3], "C": [0, 3, 2, 3], "D": [0, 1, 2, 4]})
    >>> # convert data from range [0, 4] to range [1, 5]
    >>> data_out = convert_scale(data_in, offset=1)
    >>> data_out["A"]
    >>> [2, 3, 4, 2]
    >>> data_out["B"]
    >>> [5, 1, 2, 4]
    >>> data_out["C"]
    >>> [1, 4, 3, 4]
    >>> data_out["D"]
    >>> [1, 2, 3, 5]
    >>> data_in = pd.DataFrame({"A": [1, 2, 3, 1], "B": [4, 2, 1, 3], "C": [3, 3, 2, 3], "D": [4, 1, 2, 4]})
    >>> # convert data from range [1, 4] to range [0, 3]
    >>> data_out = convert_scale(data_in, offset=-1)
    >>> print(data_out)
    >>> # convert only specific columns
    >>> data_out = convert_scale(data_in, offset=-1, cols=["A", "C"])
    >>> print(data_out)

    """
    _assert_is_dtype(data, (pd.DataFrame, pd.Series))
    if not inplace:
        data = data.copy()

    if isinstance(data, pd.DataFrame):
        data = _convert_scale_dataframe(data, cols, offset)
    else:
        # Series: ``cols`` is not applicable, shift all values
        data.iloc[:] = data.iloc[:] + offset

    if inplace:
        return None
    return data
def _convert_scale_dataframe(
data: pd.DataFrame, cols: Union[Sequence[int], Sequence[str]], offset: int
) -> pd.DataFrame:
if cols is None:
data.iloc[:, :] = data.iloc[:, :] + offset
elif isinstance(cols[0], int):
data.iloc[:, cols] = data.iloc[:, cols] + offset
elif isinstance(cols[0], str):
data.loc[:, cols] = data.loc[:, cols] + offset
return data
def crop_scale(
    data: Union[pd.DataFrame, pd.Series],
    score_range: Sequence[int],
    set_nan: Optional[bool] = False,
    inplace: Optional[bool] = False,
) -> Optional[Union[pd.DataFrame, pd.Series]]:
    """Crop questionnaire scales, i.e., set values out of range to specific minimum and maximum values or to NaN.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame` or :class:`~pandas.Series`
        data to be cropped
    score_range : list of int
        possible score range of the questionnaire items. Values out of ``score_range`` are cropped.
    set_nan : bool, optional
        whether to set values out of range to NaN or to the values specified by ``score_range``. Default: ``False``
    inplace : bool, optional
        whether to perform the operation inplace or not. Default: ``False``

    Returns
    -------
    :class:`~pandas.DataFrame`, :class:`~pandas.Series`, or ``None``
        dataframe (or series) with cropped scales or ``None`` if ``inplace`` is ``True``

    """
    _assert_is_dtype(data, (pd.DataFrame, pd.Series))
    _assert_len_list(score_range, 2)
    if not inplace:
        data = data.copy()

    if set_nan:
        # out-of-range values become NaN (the default "other" of mask)
        out_of_range = (data < score_range[0]) | (data > score_range[1])
        data.mask(out_of_range, inplace=True)  # noqa: PD002
    else:
        # clamp values to the closest bound of the score range
        data.mask(data < score_range[0], other=score_range[0], inplace=True)  # noqa: PD002
        data.mask(data > score_range[1], other=score_range[1], inplace=True)  # noqa: PD002

    return None if inplace else data
def bin_scale(
    data: Union[pd.DataFrame, pd.Series],
    bins: Union[int, Sequence[float], pd.IntervalIndex],
    cols: Optional[Union[Sequence[Union[int, str]], Union[int, str]]] = None,
    first_min: Optional[bool] = True,
    last_max: Optional[bool] = False,
    inplace: Optional[bool] = False,
    **kwargs,
) -> Optional[Union[pd.Series, pd.DataFrame]]:
    """Bin questionnaire scales.

    Questionnaire scales are binned using :func:`pandas.cut` according to the bins specified by ``bins``.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame` or :class:`~pandas.Series`
        data with scales to be binned
    bins : int or list of float or :class:`~pandas.IntervalIndex``
        The criteria to bin by. ``bins`` can have one of the following types:

        * ``int`` : Defines the number of equal-width bins in the range of ``data``. The range of ``data`` is extended
          by 0.1% on each side to include the minimum and maximum values of ``data``.
        * sequence of scalars : Defines the bin edges allowing for non-uniform width. No extension of the range of
          ``data`` is done.
        * :class:`~pandas.IntervalIndex` : Defines the exact bins to be used. Note that the ``IntervalIndex`` for
          ``bins`` must be non-overlapping.
    cols : list of str or list of int, optional
        column name/index (or list of such) to be binned or ``None`` to use all columns (or if ``data`` is a series).
        Default: ``None``
    first_min : bool, optional
        whether the minimum value should be added as the leftmost edge of the first bin or not.
        Only considered if ``bins`` is a list. Default: ``True``
    last_max : bool, optional
        whether the maximum value should be added as the rightmost edge of the last bin or not.
        Only considered if ``bins`` is a list. Default: ``False``
    inplace : bool, optional
        whether to perform the operation inplace or not. Default: ``False``
    **kwargs
        additional parameters that are passed to :func:`pandas.cut`

    Returns
    -------
    :class:`~pandas.DataFrame`, :class:`~pandas.Series`, or ``None``
        dataframe (or series) with binned scales or ``None`` if ``inplace`` is ``True``

    See Also
    --------
    :func:`pandas.cut`
        Pandas method to bin values into discrete intervals.

    """
    _assert_is_dtype(data, (pd.Series, pd.DataFrame))
    if not inplace:
        data = data.copy()

    # set "labels" argument to False, but only if it wasn't specified by the user yet
    kwargs["labels"] = kwargs.get("labels", False)

    if isinstance(data, pd.Series):
        bins_c = _get_bins(data, bins, None, first_min, last_max)
        c = pd.cut(data.iloc[:], bins=bins_c, **kwargs)
        data.iloc[:] = c
        # honor the ``inplace`` contract for the Series case as well
        # (previously, ``data`` was returned unconditionally here)
        return None if inplace else data

    cols = _get_cols(data, cols)
    for col in cols:
        bins_c = _get_bins(data, bins, col, first_min, last_max)
        if isinstance(col, int):
            c = pd.cut(data.iloc[:, col], bins=bins_c, **kwargs)
            data.iloc[:, col] = c
        else:
            c = pd.cut(data.loc[:, col], bins=bins_c, **kwargs)
            data.loc[:, col] = c

    if inplace:
        return None
    return data
def wide_to_long(data: pd.DataFrame, quest_name: str, levels: Union[str, Sequence[str]]) -> pd.DataFrame:
    """Convert a dataframe wide-format into long-format.

    .. warning::
        This function is deprecated and will be removed in the future!
        Please use :func:`~biopsykit.utils.dataframe_handling.wide_to_long` instead.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        pandas DataFrame containing saliva data in wide-format, i.e. one column per saliva sample, one row per subject.
    quest_name : str
        questionnaire name, i.e., common name for each column to be converted into long-format.
    levels : str or list of str
        index levels of the resulting long-format dataframe.

    Returns
    -------
    :class:`~pandas.DataFrame`
        pandas DataFrame in long-format

    See Also
    --------
    :func:`~biopsykit.utils.dataframe_handling.wide_to_long`
        convert dataframe from wide to long format

    """
    deprecation_msg = (
        "'biopsykit.questionnaires.utils.wide_to_long()' is deprecated! "
        "Please update your code to use 'biopsykit.utils.dataframe_handling.wide_to_long()' in the future."
    )
    warnings.warn(deprecation_msg, category=DeprecationWarning)
    return wide_to_long_utils(data=data, stubname=quest_name, levels=levels)
def compute_scores(
    data: pd.DataFrame,
    quest_dict: Dict[str, Union[Sequence[str], pd.Index]],
    quest_kwargs: Optional[Dict[str, Dict[str, Any]]] = None,
) -> pd.DataFrame:
    """Compute questionnaire scores from dataframe.

    This function can be used if multiple questionnaires from a dataframe should be computed at once. If the same
    questionnaire was assessed at multiple time points, these scores will be computed separately
    (see ``Notes`` and ``Examples``).

    The questionnaires (and the dataframe columns belonging to the questionnaires) are specified by ``quest_dict``.

    .. note::
        If questionnaires were collected at different time points (e.g., `pre` and `post`), which should all be
        computed, then the dictionary keys need to have the following format: "<questionnaire_name>-<time_point>".

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        dataframe containing questionnaire data
    quest_dict : dict
        dictionary with questionnaire names to be computed (keys) and columns of the questionnaires (values)
    quest_kwargs : dict
        dictionary with optional arguments to be passed to questionnaire functions. The dictionary is expected
        consist of questionnaire names (keys) and ``**kwargs`` dictionaries (values) with arguments per questionnaire

    Returns
    -------
    :class:`~pandas.DataFrame`
        dataframe with computed questionnaire scores

    Examples
    --------
    >>> from biopsykit.questionnaires.utils import compute_scores
    >>> quest_dict = {
    >>>     "PSS": ["PSS_{:02d}".format(i) for i in range(1, 11)],  # PSS: one time point
    >>>     "PASA-pre": ["PASA_{:02d}_T0".format(i) for i in range(1, 17)],  # PASA: two time points (pre and post)
    >>>     "PASA-post": ["PASA_{:02d}_T1".format(i) for i in range(1, 17)],  # PASA: two time points (pre and post)
    >>> }
    >>> compute_scores(data, quest_dict)

    """
    from biopsykit.questionnaires import questionnaires  # pylint:disable=import-outside-toplevel

    _assert_is_dtype(data, pd.DataFrame)
    df_scores = pd.DataFrame(index=data.index)
    # map of all available questionnaire functions, keyed by (lowercase) function name
    quest_funcs = dict(getmembers(questionnaires, isfunction))
    quest_kwargs = {} if quest_kwargs is None else quest_kwargs

    for score, columns in quest_dict.items():
        score_orig = score
        score = score.lower()  # noqa: PLW2901
        suffix = None
        if "-" in score:
            # key format "<questionnaire_name>-<time_point>": split off the time-point suffix
            score_split = score.split("-")
            score = score_split[0]  # noqa: PLW2901
            suffix = score_split[1]
        if score not in quest_funcs:
            raise ValueError(
                "Unknown questionnaire '{}'! Call "
                "'biopsykit.questionnaires.utils.get_supported_questionnaires()' "
                "to get a list of all supported questionnaires.".format(score)
            )
        # per-questionnaire kwargs are looked up by the *original* (unsplit) key
        kwargs = quest_kwargs.get(score_orig, {})
        try:
            df = quest_funcs[score](data[columns], **kwargs)
        except TypeError as e:
            raise TypeError(
                "Error computing questionnaire '{}'. The computation failed with the following "
                "error: \n\n{}.".format(score, str(e))
            ) from e
        if suffix is not None:
            # tag the resulting score columns with the time-point suffix
            df.columns = [f"{col}_{suffix}" for col in df.columns]
        df_scores = df_scores.join(df)

    return df_scores
def get_supported_questionnaires() -> Dict[str, str]:
    """List all supported (i.e., implemented) questionnaires.

    Returns
    -------
    dict
        dictionary with questionnaire names (keys) and description (values)

    """
    from biopsykit.questionnaires import questionnaires  # pylint:disable=import-outside-toplevel

    supported = {}
    for name, func in getmembers(questionnaires, isfunction):
        # skip private helper functions
        if name.startswith("_"):
            continue
        # the description is the bold-faced part ("**...**") of the docstring's first line;
        # functions without such a summary are not considered questionnaire implementations
        first_line = func.__doc__.split("\n")[0]
        match = re.findall(r"\*\*(.*)\*\*.", first_line)
        if len(match) == 0:
            continue
        supported[name] = match[0]
    return supported
def _compute_questionnaire_subscales(
    data: pd.DataFrame,
    score_name: str,
    subscales: Dict[str, Sequence[Union[str, int]]],
    agg_type: Optional[Literal["sum", "mean"]] = "sum",
) -> Dict[str, pd.Series]:
    """Compute questionnaire subscales (helper function).

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        dataframe containing questionnaire data
    score_name : str
        name of the questionnaire
    subscales : dict
        dictionary with subscales to be computed. Keys are subscale names, values are the indices of the items
        belonging to the subscales
    agg_type : str
        whether to compute a ``sum`` or a ``mean`` score. Default: ``sum``

    Returns
    -------
    dict
        dictionary with computed subscales

    """
    _assert_is_dtype(data, pd.DataFrame)
    results = {}
    for subscale_name, items in subscales.items():
        if all(np.issubdtype(type(item), np.integer) for item in items):
            # 1-based column indices (converted to 0-based positions internally)
            subscale_score = _compute_questionnaire_scores_int(data, items, agg_type)
        elif all(isinstance(item, str) for item in items):
            # column names
            subscale_score = _compute_questionnaire_scores_str(data, items, agg_type)
        else:
            raise ValueError(
                "Subscale columns are either expected as column names (list of strings) or "
                "column indices (list of integers)!"
            )
        results[f"{score_name}_{subscale_name}"] = subscale_score
    return results
def _compute_questionnaire_scores_int(data: pd.DataFrame, items: Sequence[int], agg_type: str):
if agg_type == "sum":
return data.iloc[:, to_idx(items)].sum(axis=1)
return data.iloc[:, to_idx(items)].mean(axis=1)
def _compute_questionnaire_scores_str(data: pd.DataFrame, items: Sequence[str], agg_type: str):
if agg_type == "sum":
return data.loc[:, items].sum(axis=1)
return data.loc[:, items].mean(axis=1)
def _get_cols(
data: pd.DataFrame, cols: Optional[Union[Sequence[Union[int, str]], Union[int, str]]] = None
) -> Sequence[Union[str, int]]:
if isinstance(cols, int):
cols = [cols]
if isinstance(cols, str):
cols = [cols]
if cols is None:
cols = list(data.columns)
return cols
def _get_bins(
data: Union[pd.DataFrame, pd.Series],
bins: Union[int, Sequence[float]],
col: Optional[Union[int, str]] = None,
first_min: Optional[bool] = False,
last_max: Optional[bool] = False,
) -> Union[int, Sequence[float]]:
if isinstance(bins, (int, pd.IntervalIndex)):
return bins
# ensure list
bins = list(bins)
if first_min:
min_val = _bin_scale_get_min_val(data, col)
if min_val < min(bins):
bins = [min_val - 0.01, *bins]
if last_max:
max_val = _bin_scale_get_max_val(data, col)
if max_val > max(bins):
bins = [*bins, max_val + 0.01]
return bins
def _bin_scale_get_min_val(data: Union[pd.DataFrame, pd.Series], col: Union[int, str]) -> float:
if isinstance(col, int):
return data.iloc[:, col].min()
if isinstance(col, str):
return data[col].min()
return data.min()
def _bin_scale_get_max_val(data: Union[pd.DataFrame, pd.Series], col: Union[int, str]) -> float:
if isinstance(col, int):
return data.iloc[:, col].max()
if isinstance(col, str):
return data[col].max()
return data.max()