"""Module containing utility functions for manipulating and processing questionnaire data."""
import re
import warnings
from inspect import getmembers, isfunction
from typing import Any, Dict, Literal, Optional, Sequence, Tuple, Union
import numpy as np
import pandas as pd
from biopsykit.utils._datatype_validation_helper import _assert_is_dtype, _assert_len_list, _assert_value_range
from biopsykit.utils.dataframe_handling import wide_to_long as wide_to_long_utils
__all__ = [
"bin_scale",
"compute_scores",
"crop_scale",
"convert_scale",
"find_cols",
"zero_pad_columns",
"invert",
"to_idx",
"wide_to_long",
"get_supported_questionnaires",
]
def find_cols(
    data: pd.DataFrame,
    regex_str: Optional[str] = None,
    starts_with: Optional[str] = None,
    ends_with: Optional[str] = None,
    contains: Optional[str] = None,
    zero_pad_numbers: Optional[bool] = True,
) -> Tuple[pd.DataFrame, Sequence[str]]:
    r"""Find columns in dataframe that match a specific pattern.

    This function is useful to find all columns that belong to a questionnaire. Column names can be filtered based on
    one (or a combination of) the following criteria:

    * ``starts_with``: columns have to start with the specified string
    * ``ends_with``: columns have to end with the specified string
    * ``contains``: columns have to contain the specified string

    Optionally, the item numbers in the matching column names can be zero-padded, if they are not already.

    .. note::
        If ``zero_pad_numbers`` is ``True`` then the column names returned by this function will be renamed and might
        thus not match the column names of the original dataframe. To solve this, make sure your original dataframe
        already has zero-padded columns (by manually renaming them) or convert column names using
        :func:`~biopsykit.questionnaires.utils.zero_pad_columns`.

    .. warning::
        Zero-padding using :func:`~biopsykit.questionnaires.utils.zero_pad_columns` assumes, by default, that numbers
        are *at the end* of column names. If you want to change that behavior
        (e.g., because the column names have string suffixes), you might need to apply zero-padding manually.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        dataframe with columns to be filtered
    regex_str : str, optional
        regex string to extract column names. If this parameter is passed the other parameters (``starts_with``,
        ``ends_with``, ``contains``) will be ignored. Default: ``None``
    starts_with : str, optional
        string columns have to start with. Default: ``None``
    ends_with : str, optional
        string columns have to end with. Default: ``None``
    contains : str, optional
        string columns have to contain. Default: ``None``
    zero_pad_numbers : bool, optional
        whether to zero-pad numbers in column names. Default: ``True``

    Returns
    -------
    data_filt : :class:`~pandas.DataFrame`
        dataframe with filtered columns that match the specified pattern
    cols : :class:`~pandas.Index`
        columns that match the specified pattern

    Examples
    --------
    >>> import biopsykit as bp
    >>> import pandas as pd
    >>> # Option 1: has to start with "XX"
    >>> data = pd.DataFrame(columns=["XX_{}".format(i) for i in range(1, 11)])
    >>> df, cols = bp.questionnaires.utils.find_cols(data, starts_with="XX")
    >>> print(cols)
    >>> ["XX_01", "XX_02", ..., "XX_10"]
    >>> # Option 2: has to end with "Post"
    >>> data = pd.DataFrame(columns=["XX_1_Pre", "XX_2_Pre", "XX_3_Pre", "XX_1_Post", "XX_2_Post", "XX_3_Post"])
    >>> df, cols = bp.questionnaires.utils.find_cols(data, ends_with="Post")
    >>> print(cols)
    >>> ["XX_01_Post", "XX_02_Post", "XX_03_Post"]
    >>> # Option 3: has to start with "XX" and end with "Post"
    >>> data = pd.DataFrame(columns=["XX_1_Pre", "XX_2_Pre", "XX_3_Pre", "XX_1_Post", "XX_2_Post", "XX_3_Post",
    "YY_1_Pre", "YY_2_Pre", "YY_1_Post", "YY_2_Post"])
    >>> df, cols = bp.questionnaires.utils.find_cols(data, starts_with="XX", ends_with="Post")
    >>> print(cols)
    >>> # WARNING: this will not zero-pad the questionnaire numbers!
    >>> ["XX_1_Post", "XX_2_Post", "XX_3_Post"]
    >>> # Option 4: pass custom regex string
    >>> data = pd.DataFrame(columns=["XX_1_Pre", "XX_2_Pre", "XX_3_Pre", "XX_1_Post", "XX_2_Post", "XX_3_Post",
    "YY_1_Pre", "YY_2_Pre", "YY_1_Post", "YY_2_Post"])
    >>> df, cols = bp.questionnaires.utils.find_cols(data, regex_str=r"XX_\d+_\w+")
    >>> print(cols)
    >>> # here, zero-padding will be possible again
    >>> ["XX_01_Post", "XX_02_Post", "XX_03_Post"]
    >>> # Option 5: disable zero-padding
    >>> data = pd.DataFrame(columns=["XX_{}".format(i) for i in range(1, 11)])
    >>> df, cols = bp.questionnaires.utils.find_cols(data, starts_with="XX", zero_pad_numbers=False)
    >>> print(cols)
    >>> ["XX_1", "XX_2", ..., "XX_10"]

    """
    _assert_is_dtype(data, pd.DataFrame)
    data_filt = data.copy()

    if regex_str:
        # a custom regex takes precedence over all other filter criteria
        data_filt = data_filt.filter(regex=regex_str)
    else:
        # the three criteria are applied successively, i.e., they are combined with logical AND
        if starts_with:
            data_filt = data_filt.filter(regex="^" + starts_with)
        if ends_with:
            data_filt = data_filt.filter(regex=ends_with + "$")
        if contains:
            data_filt = data_filt.filter(regex=contains)

    if zero_pad_numbers:
        data_filt = zero_pad_columns(data_filt)
    cols = data_filt.columns

    return data_filt, cols
def zero_pad_columns(data: pd.DataFrame, inplace: Optional[bool] = False) -> Optional[pd.DataFrame]:
    r"""Add zero-padding to numbers at the **end** of column names in a dataframe.

    .. warning::
        By default, this function assumes that numbers are **at the end** of column names. If you need to change that
        behavior (e.g., because the column names have string suffixes), you might need to apply zero-padding manually.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        dataframe with columns to zero-pad
    inplace : bool, optional
        whether to perform the operation inplace or not. Default: ``False``

    Returns
    -------
    :class:`~pandas.DataFrame` or ``None``
        dataframe with zero-padded columns or ``None`` if ``inplace`` is ``True``

    """
    _assert_is_dtype(data, pd.DataFrame)
    if not inplace:
        data = data.copy()

    # extract the trailing number (if any) of each column name
    nums = [re.findall(r"(\d+)$", c) for c in data.columns]
    nums = [c[0] if len(c) > 0 else "" for c in nums]
    if len(nums) == 0:
        # dataframe has no columns at all -> nothing to pad; honor the ``inplace`` contract
        # instead of discarding ``data`` (and its index) by returning a brand-new dataframe
        return None if inplace else data

    # pad to the longest number found, but at least to a width of 2 (e.g., "1" -> "01")
    zfill_num = max(max(list(map(len, nums))), 2)
    data.columns = [re.sub(r"(\d+)$", lambda m: m.group(1).zfill(zfill_num), c) for c in data.columns]
    if inplace:
        return None
    return data
def to_idx(col_idxs: Union[np.array, Sequence[int]]) -> np.ndarray:
    """Convert 1-based questionnaire item indices into 0-based array indices.

    Questionnaire items are conventionally numbered starting at 1. To avoid confusion in the
    implementation of questionnaires (array indices start at 0), BioPsyKit also numbers
    questionnaire items starting at 1 and uses this function to shift them to 0-based indexing.

    Parameters
    ----------
    col_idxs : list of int
        list of indices to convert to 0-based indexing

    Returns
    -------
    :class:`~numpy.ndarray`
        array with converted indices

    """
    return np.asarray(col_idxs) - 1
def invert(
    data: Union[pd.DataFrame, pd.Series],
    score_range: Sequence[int],
    cols: Optional[Union[np.array, Sequence[int], Sequence[str]]] = None,
    inplace: Optional[bool] = False,
) -> Optional[Union[pd.DataFrame, pd.Series]]:
    """Invert questionnaire scores.

    Many questionnaires contain items that need to be inverted (reversed) before sum scores can
    be computed. This function inverts either a single column (Series), selected columns of a
    dataframe (via the ``cols`` parameter), or a complete dataframe. An inverted value ``v``
    becomes ``score_range[1] - v + score_range[0]``.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame` or :class:`~pandas.Series`
        questionnaire data to invert
    score_range : list of int
        possible score range of the questionnaire items
    cols : list of str or list of int
        list of column names or column indices
    inplace : bool, optional
        whether to perform the operation inplace or not. Default: ``False``

    Returns
    -------
    :class:`~pandas.DataFrame` or ``None``
        dataframe with inverted columns or ``None`` if ``inplace`` is ``True``

    Raises
    ------
    :exc:`~biopsykit.utils.exceptions.ValidationError`
        if ``data`` is no dataframe or series
        if ``score_range`` does not have length 2
    :exc:`~biopsykit.utils.exceptions.ValueRangeError`
        if values in ``data`` are not in ``score_range``

    Examples
    --------
    >>> from biopsykit.questionnaires.utils import invert
    >>> data_in = pd.DataFrame({"A": [1, 2, 3, 1], "B": [4, 0, 1, 3], "C": [0, 3, 2, 3], "D": [0, 1, 2, 4]})
    >>> data_out = invert(data_in, score_range=[0, 4])
    >>> data_out["A"]
    >>> [3, 2, 1, 3]
    >>> # Invert only specific columns
    >>> data_out = invert(data_in, score_range=[0, 4], cols=["A", "C"])
    >>> data_out["B"]  # left unchanged
    >>> [4, 0, 1, 3]

    """
    _assert_is_dtype(data, (pd.DataFrame, pd.Series))
    _assert_len_list(score_range, 2)
    if not inplace:
        data = data.copy()

    if isinstance(data, pd.Series):
        # single column: invert all values (range check first)
        _assert_value_range(data, score_range)
        data.iloc[:] = score_range[1] - data.iloc[:] + score_range[0]
    else:
        data = _invert_dataframe(data, cols, score_range)

    return None if inplace else data
def _invert_dataframe(
    data: pd.DataFrame, cols: Union[Sequence[str], Sequence[int]], score_range: Sequence[int]
) -> pd.DataFrame:
    # Invert all (or the selected) columns of ``data`` within ``score_range`` (helper function).
    lo, hi = score_range[0], score_range[1]
    if cols is None:
        # no column selection -> invert the whole dataframe
        _assert_value_range(data, score_range)
        data.iloc[:, :] = hi - data.iloc[:, :] + lo
    elif isinstance(cols[0], str):
        # column names -> label-based selection
        _assert_value_range(data[cols], score_range)
        data.loc[:, cols] = hi - data.loc[:, cols] + lo
    else:
        # column indices -> positional selection
        _assert_value_range(data.iloc[:, cols], score_range)
        data.iloc[:, cols] = hi - data.iloc[:, cols] + lo
    return data
def _invert_subscales(
    data: pd.DataFrame,
    subscales: Dict[str, Sequence[Union[str, int]]],
    idx_dict: Dict[str, Sequence[int]],
    score_range: Sequence[int],
) -> pd.DataFrame:
    """Invert questionnaire scores from a dictionary of questionnaire subscales.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        questionnaire data to invert
    subscales : dict
        dictionary with subscale names (keys) and list of item indices or column names belonging to the
        individual subscales (values)
    idx_dict : dict
        dictionary with subscale names (keys) and indices of items that should be inverted (values)
    score_range : list of int
        possible score range of the questionnaire items

    Returns
    -------
    :class:`~pandas.DataFrame` or ``None``
        dataframe with inverted columns

    See Also
    --------
    invert : invert scores of questionnaire columns

    """
    _assert_is_dtype(data, pd.DataFrame)
    for scale_name, invert_idxs in idx_dict.items():
        # silently skip subscales that were not selected for computation
        if scale_name not in subscales:
            continue
        item_cols = np.array(subscales[scale_name])[invert_idxs]
        data = invert(data, cols=to_idx(item_cols), score_range=score_range)
    return data
def convert_scale(
    data: Union[pd.DataFrame, pd.Series],
    offset: int,
    cols: Optional[Union[Sequence[int], Sequence[str]]] = None,
    inplace: Optional[bool] = False,
) -> Optional[Union[pd.DataFrame, pd.Series]]:
    """Convert the score range of questionnaire items.

    Adds ``offset`` to all values (or only to the values of the columns selected by ``cols``),
    e.g., to shift a questionnaire from a [0, 4] to a [1, 5] range.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame` or :class:`~pandas.Series`
        questionnaire data to convert
    offset : int
        offset to add to questionnaire items
    cols : list of str or list of int, optional
        list of column names or column indices to convert, or ``None`` to convert all columns.
        Default: ``None``
    inplace : bool, optional
        whether to perform the operation inplace or not. Default: ``False``

    Returns
    -------
    :class:`~pandas.DataFrame`, :class:`~pandas.Series`, or ``None``
        dataframe with converted columns or ``None`` if ``inplace`` is ``True``

    Raises
    ------
    :exc:`~biopsykit.utils.exceptions.ValidationError`
        if ``data`` is no dataframe or series

    Examples
    --------
    >>> from biopsykit.questionnaires.utils import convert_scale
    >>> data_in = pd.DataFrame({"A": [1, 2, 3, 1], "B": [4, 0, 1, 3], "C": [0, 3, 2, 3], "D": [0, 1, 2, 4]})
    >>> # convert data from range [0, 4] to range [1, 5]
    >>> data_out = convert_scale(data_in, offset=1)
    >>> data_out["A"]
    >>> [2, 3, 4, 2]
    >>> data_out["B"]
    >>> [5, 1, 2, 4]
    >>> data_out["C"]
    >>> [1, 4, 3, 4]
    >>> data_out["D"]
    >>> [1, 2, 3, 5]
    >>> data_in = pd.DataFrame({"A": [1, 2, 3, 1], "B": [4, 2, 1, 3], "C": [3, 3, 2, 3], "D": [4, 1, 2, 4]})
    >>> # convert data from range [1, 4] to range [0, 3]
    >>> data_out = convert_scale(data_in, offset=-1)
    >>> print(data_out)
    >>> # convert only specific columns
    >>> data_out = convert_scale(data_in, offset=-1, cols=["A", "C"])
    >>> print(data_out)

    """
    _assert_is_dtype(data, (pd.DataFrame, pd.Series))
    if not inplace:
        data = data.copy()

    if isinstance(data, pd.DataFrame):
        data = _convert_scale_dataframe(data, cols, offset)
    else:
        # Series: ``cols`` is not applicable, shift all values
        data.iloc[:] = data.iloc[:] + offset

    if inplace:
        return None
    return data
def _convert_scale_dataframe(
data: pd.DataFrame, cols: Union[Sequence[int], Sequence[str]], offset: int
) -> pd.DataFrame:
if cols is None:
data.iloc[:, :] = data.iloc[:, :] + offset
elif isinstance(cols[0], int):
data.iloc[:, cols] = data.iloc[:, cols] + offset
elif isinstance(cols[0], str):
data.loc[:, cols] = data.loc[:, cols] + offset
return data
def crop_scale(
    data: Union[pd.DataFrame, pd.Series],
    score_range: Sequence[int],
    set_nan: Optional[bool] = False,
    inplace: Optional[bool] = False,
) -> Optional[Union[pd.DataFrame, pd.Series]]:
    """Crop questionnaire scales, i.e., set values out of range to specific minimum and maximum values or to NaN.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame` or :class:`~pandas.Series`
        data to be cropped
    score_range : list of int
        possible score range of the questionnaire items. Values out of ``score_range`` are cropped.
    set_nan : bool, optional
        whether to set values out of range to NaN or to the values specified by ``score_range``. Default: ``False``
    inplace : bool, optional
        whether to perform the operation inplace or not. Default: ``False``

    Returns
    -------
    :class:`~pandas.DataFrame`, :class:`~pandas.Series`, or ``None``
        dataframe (or series) with cropped scales or ``None`` if ``inplace`` is ``True``

    """
    _assert_is_dtype(data, (pd.DataFrame, pd.Series))
    _assert_len_list(score_range, 2)
    if not inplace:
        data = data.copy()

    if set_nan:
        # out-of-range values become NaN (the default "other" of mask)
        out_of_range = (data < score_range[0]) | (data > score_range[1])
        data.mask(out_of_range, inplace=True)  # noqa: PD002
    else:
        # clamp values to the closest bound of the score range
        data.mask(data < score_range[0], other=score_range[0], inplace=True)  # noqa: PD002
        data.mask(data > score_range[1], other=score_range[1], inplace=True)  # noqa: PD002

    return None if inplace else data
def bin_scale(
    data: Union[pd.DataFrame, pd.Series],
    bins: Union[int, Sequence[float], pd.IntervalIndex],
    cols: Optional[Union[Sequence[Union[int, str]], Union[int, str]]] = None,
    first_min: Optional[bool] = True,
    last_max: Optional[bool] = False,
    inplace: Optional[bool] = False,
    **kwargs,
) -> Optional[Union[pd.Series, pd.DataFrame]]:
    """Bin questionnaire scales.

    Questionnaire scales are binned using :func:`pandas.cut` according to the bins specified by ``bins``.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame` or :class:`~pandas.Series`
        data with scales to be binned
    bins : int or list of float or :class:`~pandas.IntervalIndex``
        The criteria to bin by. ``bins`` can have one of the following types:

        * ``int`` : Defines the number of equal-width bins in the range of ``data``. The range of ``data`` is extended
          by 0.1% on each side to include the minimum and maximum values of ``data``.
        * sequence of scalars : Defines the bin edges allowing for non-uniform width. No extension of the range of
          ``data`` is done.
        * :class:`~pandas.IntervalIndex` : Defines the exact bins to be used. Note that the ``IntervalIndex`` for
          ``bins`` must be non-overlapping.
    cols : list of str or list of int, optional
        column name/index (or list of such) to be binned or ``None`` to use all columns (or if ``data`` is a series).
        Default: ``None``
    first_min : bool, optional
        whether the minimum value should be added as the leftmost edge of the first bin or not.
        Only considered if ``bins`` is a list. Default: ``True``
    last_max : bool, optional
        whether the maximum value should be added as the rightmost edge of the last bin or not.
        Only considered if ``bins`` is a list. Default: ``False``
    inplace : bool, optional
        whether to perform the operation inplace or not. Default: ``False``
    **kwargs
        additional parameters that are passed to :func:`pandas.cut`

    Returns
    -------
    :class:`~pandas.DataFrame`, :class:`~pandas.Series`, or ``None``
        dataframe (or series) with binned scales or ``None`` if ``inplace`` is ``True``

    See Also
    --------
    :func:`pandas.cut`
        Pandas method to bin values into discrete intervals.

    """
    _assert_is_dtype(data, (pd.Series, pd.DataFrame))
    if not inplace:
        data = data.copy()

    # set "labels" argument to False, but only if it wasn't specified by the user yet
    kwargs["labels"] = kwargs.get("labels", False)

    if isinstance(data, pd.Series):
        bins_c = _get_bins(data, bins, None, first_min, last_max)
        c = pd.cut(data.iloc[:], bins=bins_c, **kwargs)
        data.iloc[:] = c
        # honor the ``inplace`` contract for the Series case as well
        # (previously, ``data`` was returned unconditionally here)
        return None if inplace else data

    cols = _get_cols(data, cols)
    for col in cols:
        bins_c = _get_bins(data, bins, col, first_min, last_max)
        if isinstance(col, int):
            c = pd.cut(data.iloc[:, col], bins=bins_c, **kwargs)
            data.iloc[:, col] = c
        else:
            c = pd.cut(data.loc[:, col], bins=bins_c, **kwargs)
            data.loc[:, col] = c

    if inplace:
        return None
    return data
def wide_to_long(data: pd.DataFrame, quest_name: str, levels: Union[str, Sequence[str]]) -> pd.DataFrame:
    """Convert a dataframe wide-format into long-format.

    .. warning::
        This function is deprecated and will be removed in the future!
        Please use :func:`~biopsykit.utils.dataframe_handling.wide_to_long` instead.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        pandas DataFrame containing saliva data in wide-format, i.e. one column per saliva sample, one row per subject.
    quest_name : str
        questionnaire name, i.e., common name for each column to be converted into long-format.
    levels : str or list of str
        index levels of the resulting long-format dataframe.

    Returns
    -------
    :class:`~pandas.DataFrame`
        pandas DataFrame in long-format

    See Also
    --------
    :func:`~biopsykit.utils.dataframe_handling.wide_to_long`
        convert dataframe from wide to long format

    """
    deprecation_msg = (
        "'biopsykit.questionnaires.utils.wide_to_long()' is deprecated! "
        "Please update your code to use 'biopsykit.utils.dataframe_handling.wide_to_long()' in the future."
    )
    warnings.warn(deprecation_msg, category=DeprecationWarning)
    return wide_to_long_utils(data=data, stubname=quest_name, levels=levels)
def compute_scores(
    data: pd.DataFrame,
    quest_dict: Dict[str, Union[Sequence[str], pd.Index]],
    quest_kwargs: Optional[Dict[str, Dict[str, Any]]] = None,
) -> pd.DataFrame:
    """Compute questionnaire scores from dataframe.

    This function can be used if multiple questionnaires from a dataframe should be computed at once. If the same
    questionnaire was assessed at multiple time points, these scores will be computed separately
    (see ``Notes`` and ``Examples``).

    The questionnaires (and the dataframe columns belonging to the questionnaires) are specified by ``quest_dict``.

    .. note::
        If questionnaires were collected at different time points (e.g., `pre` and `post`), which should all be
        computed, then the dictionary keys need to have the following format: "<questionnaire_name>-<time_point>".

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        dataframe containing questionnaire data
    quest_dict : dict
        dictionary with questionnaire names to be computed (keys) and columns of the questionnaires (values)
    quest_kwargs : dict
        dictionary with optional arguments to be passed to questionnaire functions. The dictionary is expected
        consist of questionnaire names (keys) and ``**kwargs`` dictionaries (values) with arguments per questionnaire

    Returns
    -------
    :class:`~pandas.DataFrame`
        dataframe with computed questionnaire scores

    Examples
    --------
    >>> from biopsykit.questionnaires.utils import compute_scores
    >>> quest_dict = {
    >>>     "PSS": ["PSS_{:02d}".format(i) for i in range(1, 11)],  # PSS: one time point
    >>>     "PASA-pre": ["PASA_{:02d}_T0".format(i) for i in range(1, 17)],  # PASA: two time points (pre and post)
    >>>     "PASA-post": ["PASA_{:02d}_T1".format(i) for i in range(1, 17)],  # PASA: two time points (pre and post)
    >>> }
    >>> compute_scores(data, quest_dict)

    """
    from biopsykit.questionnaires import questionnaires  # pylint:disable=import-outside-toplevel

    _assert_is_dtype(data, pd.DataFrame)
    df_scores = pd.DataFrame(index=data.index)
    # map of all available questionnaire functions, keyed by (lowercase) function name
    quest_funcs = dict(getmembers(questionnaires, isfunction))
    quest_kwargs = {} if quest_kwargs is None else quest_kwargs

    for score, columns in quest_dict.items():
        score_orig = score
        score = score.lower()  # noqa: PLW2901
        suffix = None
        if "-" in score:
            # key format "<questionnaire_name>-<time_point>": split off the time-point suffix
            score_split = score.split("-")
            score = score_split[0]  # noqa: PLW2901
            suffix = score_split[1]
        if score not in quest_funcs:
            raise ValueError(
                "Unknown questionnaire '{}'! Call "
                "'biopsykit.questionnaires.utils.get_supported_questionnaires()' "
                "to get a list of all supported questionnaires.".format(score)
            )
        # per-questionnaire kwargs are looked up by the *original* (unsplit) key
        kwargs = quest_kwargs.get(score_orig, {})
        try:
            df = quest_funcs[score](data[columns], **kwargs)
        except TypeError as e:
            raise TypeError(
                "Error computing questionnaire '{}'. The computation failed with the following "
                "error: \n\n{}.".format(score, str(e))
            ) from e
        if suffix is not None:
            # tag the resulting score columns with the time-point suffix
            df.columns = [f"{col}_{suffix}" for col in df.columns]
        df_scores = df_scores.join(df)

    return df_scores
def get_supported_questionnaires() -> Dict[str, str]:
    """List all supported (i.e., implemented) questionnaires.

    Returns
    -------
    dict
        dictionary with questionnaire names (keys) and description (values)

    """
    from biopsykit.questionnaires import questionnaires  # pylint:disable=import-outside-toplevel

    supported = {}
    for name, func in getmembers(questionnaires, isfunction):
        # skip private helper functions
        if name.startswith("_"):
            continue
        # the description is the bold-faced part ("**...**") of the docstring's first line;
        # functions without such a summary are not considered questionnaire implementations
        first_line = func.__doc__.split("\n")[0]
        match = re.findall(r"\*\*(.*)\*\*.", first_line)
        if len(match) == 0:
            continue
        supported[name] = match[0]
    return supported
def _compute_questionnaire_subscales(
    data: pd.DataFrame,
    score_name: str,
    subscales: Dict[str, Sequence[Union[str, int]]],
    agg_type: Optional[Literal["sum", "mean"]] = "sum",
) -> Dict[str, pd.Series]:
    """Compute questionnaire subscales (helper function).

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        dataframe containing questionnaire data
    score_name : str
        name of the questionnaire
    subscales : dict
        dictionary with subscales to be computed. Keys are subscale names, values are the indices of the items
        belonging to the subscales
    agg_type : str
        whether to compute a ``sum`` or a ``mean`` score. Default: ``sum``

    Returns
    -------
    dict
        dictionary with computed subscales

    """
    _assert_is_dtype(data, pd.DataFrame)
    results = {}
    for subscale_name, items in subscales.items():
        if all(np.issubdtype(type(item), np.integer) for item in items):
            # 1-based column indices (converted to 0-based positions internally)
            subscale_score = _compute_questionnaire_scores_int(data, items, agg_type)
        elif all(isinstance(item, str) for item in items):
            # column names
            subscale_score = _compute_questionnaire_scores_str(data, items, agg_type)
        else:
            raise ValueError(
                "Subscale columns are either expected as column names (list of strings) or "
                "column indices (list of integers)!"
            )
        results[f"{score_name}_{subscale_name}"] = subscale_score
    return results
def _compute_questionnaire_scores_int(data: pd.DataFrame, items: Sequence[int], agg_type: str):
if agg_type == "sum":
return data.iloc[:, to_idx(items)].sum(axis=1)
return data.iloc[:, to_idx(items)].mean(axis=1)
def _compute_questionnaire_scores_str(data: pd.DataFrame, items: Sequence[str], agg_type: str):
if agg_type == "sum":
return data.loc[:, items].sum(axis=1)
return data.loc[:, items].mean(axis=1)
def _get_cols(
data: pd.DataFrame, cols: Optional[Union[Sequence[Union[int, str]], Union[int, str]]] = None
) -> Sequence[Union[str, int]]:
if isinstance(cols, int):
cols = [cols]
if isinstance(cols, str):
cols = [cols]
if cols is None:
cols = list(data.columns)
return cols
def _get_bins(
data: Union[pd.DataFrame, pd.Series],
bins: Union[int, Sequence[float]],
col: Optional[Union[int, str]] = None,
first_min: Optional[bool] = False,
last_max: Optional[bool] = False,
) -> Union[int, Sequence[float]]:
if isinstance(bins, (int, pd.IntervalIndex)):
return bins
# ensure list
bins = list(bins)
if first_min:
min_val = _bin_scale_get_min_val(data, col)
if min_val < min(bins):
bins = [min_val - 0.01, *bins]
if last_max:
max_val = _bin_scale_get_max_val(data, col)
if max_val > max(bins):
bins = [*bins, max_val + 0.01]
return bins
def _bin_scale_get_min_val(data: Union[pd.DataFrame, pd.Series], col: Union[int, str]) -> float:
if isinstance(col, int):
return data.iloc[:, col].min()
if isinstance(col, str):
return data[col].min()
return data.min()
def _bin_scale_get_max_val(data: Union[pd.DataFrame, pd.Series], col: Union[int, str]) -> float:
if isinstance(col, int):
return data.iloc[:, col].max()
if isinstance(col, str):
return data[col].max()
return data.max()