# Source code for biopsykit.metadata.metadata

"""Module containing implementations to compute various metadata parameters.

Each function at least expects a dataframe containing the required columns in a specified order
(see function documentations for specifics) to be passed to the ``data`` argument.

If ``data`` is a dataframe that contains more than the required two columns, e.g., if the complete dataframe
is passed, the required columns can be sliced by specifying them in the ``columns`` parameter.

"""
from typing import Optional, Sequence, Union

import pandas as pd
from biopsykit.utils._datatype_validation_helper import (
    _assert_has_columns,
    _assert_has_index_levels,
    _assert_len_list,
    _assert_value_range,
)

__all__ = ["bmi", "whr", "gender_counts"]

from biopsykit.utils.exceptions import ValidationError


def gender_counts(
    data: pd.DataFrame, gender_col: Optional[str] = None, split_condition: Optional[bool] = False
) -> pd.DataFrame:
    """Get statistics about gender distribution from a dataset.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        dataframe with subjects
    gender_col : str, optional
        column name containing gender information or ``None`` to use default name ("gender").
    split_condition : bool, optional
        ``True`` to split gender distribution by condition (assumes that a "condition" index level is present in
        ``data``), ``False`` otherwise.
        Default: ``False``

    Returns
    -------
    :class:`~pandas.DataFrame`
        dataframe with absolute ("count") and relative ("percent") gender distribution

    Raises
    ------
    :exc:`~biopsykit.utils.exceptions.ValidationError`
        if ``gender_col`` does not refer to exactly one column

    """
    if gender_col is None:
        gender_col = "gender"
    if isinstance(gender_col, str):
        # wrap the name in a list so that the ``.loc`` selection below yields a one-column dataframe
        gender_col = [gender_col]

    try:
        _assert_len_list(gender_col, 1)
    except ValidationError as e:
        # re-raise with a message that names the offending parameter
        raise ValidationError(
            f"'gender_col' is expected to be only one column! Got {len(gender_col)} columns instead."
        ) from e
    # NOTE(review): ``gender_col`` is already a list here, so a nested list is passed on purpose --
    # presumably the helper expects a list of column sets; confirm against its signature.
    _assert_has_columns(data, [gender_col])
    data = data.loc[:, gender_col]
    if split_condition:
        _assert_has_index_levels(data, "condition", match_atleast=True)
        # compute the distribution separately for each value of the "condition" index level
        return data.groupby("condition").apply(_gender_counts)
    return _gender_counts(data)


def _gender_counts(data: pd.DataFrame):
    """Return absolute counts and percentages of the unique rows in ``data``.

    The result has the two columns "count" (number of occurrences) and "percent"
    (relative frequency multiplied by 100). Rows are not sorted by frequency.
    """
    absolute = data.value_counts(sort=False)
    relative = data.value_counts(normalize=True, sort=False) * 100
    return pd.concat([absolute, relative], axis=1, keys=["count", "percent"])


def bmi(data: pd.DataFrame, columns: Optional[Union[Sequence[str], pd.Index]] = None) -> pd.DataFrame:
    """Compute the **Body Mass Index**.

    The input is expected to provide the following columns, in this order:

    * 1st column: weight in kilogram
    * 2nd column: height in centimeter

    If ``data`` holds more than these two columns (e.g., a complete questionnaire dataframe), pass the names of
    the weight and height columns via ``columns`` to select them.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        dataframe containing weight and height information
    columns : sequence of str, optional
        names of the columns needed to compute the body mass index. Only required if ``data`` contains more than
        the two required columns.
        Default: ``None``

    Returns
    -------
    :class:`~pandas.DataFrame`
        dataframe with a single column "BMI", rounded to two decimals

    Raises
    ------
    :exc:`~biopsykit.utils.exceptions.ValueRangeError`
        if input values or output values are not in the expected range, e.g., because values are provided in the
        wrong unit or columns are in the wrong order

    """
    if columns is not None:
        # restrict the dataframe to the requested columns
        data = data.loc[:, columns]

    # the first column is interpreted as weight
    weight = data.iloc[:, 0]
    _assert_value_range(weight, [10, 300])
    # the second column is interpreted as height
    height = data.iloc[:, 1]
    _assert_value_range(height, [50, 250])

    # height is converted from centimeter to meter before squaring
    height_m = height / 100.0
    result = pd.DataFrame(weight / height_m ** 2, columns=["BMI"])
    # sanity check: the resulting BMI must lie in a reasonable range
    _assert_value_range(result, [10, 50])
    return result.round(2)


def whr(data: pd.DataFrame, columns: Optional[Union[Sequence[str], pd.Index]] = None) -> pd.DataFrame:
    """Compute the **Waist to Hip Ratio**.

    The input is expected to provide the following columns, in this order:

    * 1st column: waist circumference
    * 2nd column: hip circumference

    If ``data`` holds more than these two columns (e.g., a complete questionnaire dataframe), pass the names of
    the waist and hip columns via ``columns`` to select them.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        dataframe containing waist and hip circumference
    columns : sequence of str, optional
        names of the columns needed to compute the waist to hip ratio. Only required if ``data`` contains more
        than the two required columns.
        Default: ``None``

    Returns
    -------
    :class:`~pandas.DataFrame`
        dataframe with a single column "WHR", rounded to three decimals

    Raises
    ------
    :exc:`~biopsykit.utils.exceptions.ValueRangeError`
        if input values or output values are not in the expected range, e.g., because values are provided in the
        wrong unit or columns are in the wrong order

    """
    if columns is not None:
        # restrict the dataframe to the requested columns
        data = data.loc[:, columns]

    waist = data.iloc[:, 0]
    hip = data.iloc[:, 1]
    ratio = pd.DataFrame(waist / hip, columns=["WHR"])
    # sanity check: the resulting ratio must lie in a reasonable range
    _assert_value_range(ratio, [0.5, 1.5])
    return ratio.round(3)