"""Module providing various functions for advanced handling of pandas dataframes."""
import re
from typing import Callable, Optional, Sequence, Union
import numpy as np
import pandas as pd
from biopsykit.utils._datatype_validation_helper import _assert_has_columns, _assert_has_index_levels, _assert_is_dtype
from biopsykit.utils.datatype_helper import CodebookDataFrame, is_codebook_dataframe
__all__ = [
"apply_codebook",
"add_space_to_camel",
"camel_to_snake",
"snake_to_camel",
"convert_nan",
"int_from_str_idx",
"int_from_str_col",
"multi_xs",
"replace_missing_data",
"stack_groups_percent",
"wide_to_long",
]
def int_from_str_idx(
    data: pd.DataFrame,
    idx_levels: Union[str, Sequence[str]],
    regex: Union[str, Sequence[str]],
    func: Optional[Callable] = None,
) -> pd.DataFrame:
    """Extract integers from strings in index levels and set them as new index values.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        data with index to extract information from
    idx_levels : str or list of str
        name of index level or list of index level names
    regex : str or list of str
        regex string or list of regex strings to extract integers from strings
    func : function, optional
        function to apply to the extracted integer values. This can, for example, be a lambda function which
        increments all integers by 1. Default: ``None``

    Returns
    -------
    :class:`~pandas.DataFrame`
        dataframe with new index values

    Raises
    ------
    ValueError
        if the number of regex strings does not match the number of index level names

    """
    idx_levels = [idx_levels] if isinstance(idx_levels, str) else idx_levels
    regex = [regex] if isinstance(regex, str) else regex
    if len(idx_levels) != len(regex):
        raise ValueError(
            f"Number of values in 'regex' must match number of index levels in 'idx_levels'! "
            f"Got idx_levels: {idx_levels}, regex: {regex}."
        )
    _assert_is_dtype(data, pd.DataFrame)
    _assert_has_index_levels(data, idx_levels, match_atleast=True, match_order=False)
    original_names = data.index.names
    out = data.reset_index()
    for level_name, pattern in zip(idx_levels, regex):
        # take the first capture group of the regex and cast it to integer
        extracted = out[level_name].str.extract(pattern).astype(int)[0]
        out[level_name] = extracted if func is None else func(extracted)
    return out.set_index(original_names)
def int_from_str_col(
    data: pd.DataFrame, col_name: str, regex: str, func: Optional[Callable] = None
) -> pd.Series:
    """Extract integers from strings in the column of a dataframe and return it.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        data with column names to extract information from
    col_name : str
        name of column with string values to extract
    regex : str
        regex string used to extract integers from string values
    func : function, optional
        function to apply to the extracted integer values. This can, for example, be a lambda function which
        increments all integers by 1. Default: ``None``

    Returns
    -------
    :class:`~pandas.Series`
        series object with extracted integer values

    """
    _assert_is_dtype(data, pd.DataFrame)
    _assert_has_columns(data, [[col_name]])
    # take the first capture group of the regex and cast it to integer
    column = data[col_name].str.extract(regex).astype(int)[0]
    if func is not None:
        column = func(column)
    return column
def camel_to_snake(name: str, lower: Optional[bool] = True):
    """Convert string in "camelCase" to "snake_case".

    .. note::
        If all letters in ``name`` are capital letters the string will not be computed into snake_case because
        it is assumed to be an abbreviation.

    Parameters
    ----------
    name : str
        string to convert from camelCase to snake_case
    lower : bool, optional
        ``True`` to convert all capital letters in to lower case ("actual" snake_case), ``False`` to keep
        capital letters, if present

    Returns
    -------
    str
        string converted into snake_case

    Examples
    --------
    >>> from biopsykit.utils.dataframe_handling import camel_to_snake
    >>> camel_to_snake("HelloWorld")
    hello_world
    >>> camel_to_snake("HelloWorld", lower=False)
    Hello_World
    >>> camel_to_snake("ABC")
    ABC

    """
    if name.isupper():
        # all-caps strings are assumed to be abbreviations and are left untouched
        return name
    converted = re.sub(r"(?<!^)(?=[A-Z])", "_", name)
    return converted.lower() if lower else converted
def snake_to_camel(name: str):
    """Convert string in "snake_case" to "camelCase".

    Parameters
    ----------
    name : str
        string to convert from snake_case to camelCase

    Returns
    -------
    str
        string converted into camelCase

    """
    pieces = []
    for part in name.split("_"):
        # empty parts (consecutive underscores) are kept as a literal underscore
        pieces.append(part.capitalize() if part else "_")
    return "".join(pieces)
def add_space_to_camel(name: str) -> str:
    """Add space to string in "camelCase".

    Parameters
    ----------
    name : str
        string to transform

    Returns
    -------
    str
        string with space added

    Examples
    --------
    >>> from biopsykit.utils.dataframe_handling import add_space_to_camel
    >>> add_space_to_camel("HelloWorld")
    Hello World
    >>> add_space_to_camel("ABC")
    ABC

    """
    # insert a space before an upper-case letter that either follows a lower-case
    # letter or (not at the string start) precedes one; runs of capitals
    # (abbreviations) are left intact
    word_boundary = re.compile(r"((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))")
    return word_boundary.sub(r" \1", name)
def replace_missing_data(
    data: pd.DataFrame,
    target_col: str,
    source_col: str,
    dropna: Optional[bool] = False,
    inplace: Optional[bool] = False,
) -> Optional[pd.DataFrame]:
    """Replace missing data in one column by data from another column.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        input data with values to replace
    target_col : str
        target column, i.e., column in which missing values should be replaced
    source_col : str
        source column, i.e., column values used to replace missing values in ``target_col``
    dropna : bool, optional
        whether to drop rows with missing values in ``target_col`` or not. Default: ``False``
    inplace : bool, optional
        whether to perform the operation inplace or not. Default: ``False``

    Returns
    -------
    :class:`~pandas.DataFrame` or ``None``
        dataframe with replaced missing values or ``None`` if ``inplace`` is ``True``

    """
    _assert_is_dtype(data, pd.DataFrame)
    if not inplace:
        data = data.copy()
    # assign instead of Series.fillna(inplace=True): the chained inplace call is
    # deprecated in pandas and does not reliably write through under copy-on-write
    data[target_col] = data[target_col].fillna(data[source_col])
    if dropna:
        # drop on the same object so the result is also visible to the caller when
        # inplace=True (previously the dropped frame was rebound to a local name,
        # silently discarding the drop for inplace callers)
        data.dropna(subset=[target_col], inplace=True)
    if inplace:
        return None
    return data
def convert_nan(
    data: Union[pd.DataFrame, pd.Series], inplace: Optional[bool] = False
) -> Union[pd.DataFrame, pd.Series, None]:
    """Convert missing values to NaN.

    Data exported from programs like SPSS often uses negative integers to encode missing values because these negative
    numbers are "unrealistic" values. Use this function to convert these negative numbers to
    "actual" missing values: not-a-number (``NaN``).

    Values that will be replaced with ``NaN`` are -66, -77, -99 (integer and string representations).

    Parameters
    ----------
    data : :class:`~pandas.DataFrame` or :class:`~pandas.Series`
        input data
    inplace : bool, optional
        whether to perform the operation inplace or not. Default: ``False``

    Returns
    -------
    :class:`~pandas.DataFrame` or ``None``
        dataframe with converted missing values or ``None`` if ``inplace`` is ``True``

    """
    _assert_is_dtype(data, (pd.DataFrame, pd.Series))
    missing_codes = [-99.0, -77.0, -66.0, "-99", "-77", "-66"]
    if inplace:
        # bug fix: previously the replaced object was bound to a local name only,
        # so the caller's data was never modified when inplace=True
        data.replace(missing_codes, np.nan, inplace=True)
        return None
    return data.replace(missing_codes, np.nan)
def multi_xs(
    data: Union[pd.DataFrame, pd.Series],
    keys: Union[str, Sequence[str]],
    level: Union[str, int, Sequence[str], Sequence[int]],
    drop_level: Optional[bool] = True,
) -> Union[pd.DataFrame, pd.Series]:
    """Return cross-section of multiple keys from the dataframe.

    This function internally calls the :meth:`pandas.DataFrame.xs` method, but it can take a list of key arguments
    to return multiple keys at once, in comparison to the original :meth:`~pandas.DataFrame.xs` method which
    only takes one possible key.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame` or :class:`~pandas.Series`
        input data to get cross-section from
    keys : str or list of str
        label(s) contained in the index, or partially in a :class:`~pandas.MultiIndex`
    level : str, int, or list of such
        in case of keys partially contained in a :class:`~pandas.MultiIndex`, indicate which index levels are used.
        Levels can be referred by label or position.
    drop_level : bool, optional
        if ``False``, returns object with same levels as self. Default: ``True``

    Returns
    -------
    :class:`~pandas.DataFrame` or :class:`~pandas.Series`
        cross-section from the original dataframe or series

    """
    _assert_is_dtype(data, (pd.DataFrame, pd.Series))
    key_list = [keys] if isinstance(keys, str) else keys
    original_levels = data.index.names
    # take one cross-section per key, then glue them back together with the key
    # re-inserted as an index level so the original level structure is restored
    slices = {key: data.xs(key, level=level, drop_level=drop_level) for key in key_list}
    combined = pd.concat(slices, names=[level])
    return combined.reorder_levels(original_levels).sort_index()
def stack_groups_percent(
    data: pd.DataFrame, hue: str, stacked: str, order: Optional[Sequence[str]] = None
) -> pd.DataFrame:
    """Create dataframe with stacked groups.

    To create a stacked bar chart, i.e. a plot with different bar charts along a categorical axis,
    where the variables of each bar chart are stacked along the value axis, the data needs to be rearranged and
    normalized in percent.

    The columns of the resulting dataframe be the categorical values specified by ``hue``,
    the index items will be the variables specified by ``stacked``.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        data to compute stacked group in percent
    hue : str
        column name of grouping categorical variable. This typically corresponds to the ``x`` axis
        in a stacked bar chart.
    stacked : str
        column name of variable that is stacked along the ``y`` axis
    order : str
        order of categorical variable specified by ``hue``

    Returns
    -------
    :class:`~pandas.DataFrame`
        dataframe in a format that can be used to create a stacked bar chart

    See Also
    --------
    :func:`~biopsykit.plotting.stacked_barchart`
        function to create a stacked bar chart

    """
    # count occurrences of each (hue, stacked) combination
    counts = pd.DataFrame(data.groupby([hue, stacked]).size(), columns=["data"])
    # normalize counts within each `hue` group to percent, then pivot the
    # `stacked` categories into columns
    percent = counts.groupby(hue).apply(lambda grp: 100 * (grp / grp.sum())).T.stack().T
    if order is not None:
        percent = percent.reindex(order)
    return percent["data"]
def apply_codebook(data: pd.DataFrame, codebook: CodebookDataFrame) -> pd.DataFrame:
    """Apply codebook to convert numerical to categorical values.

    The codebook is expected to be a dataframe in a standardized format
    (see :obj:`~biopsykit.utils.datatype_helper.CodebookDataFrame` for further information).

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        data to apply codebook on
    codebook : :obj:`~biopsykit.utils.datatype_helper.CodebookDataFrame`
        path to codebook or dataframe to be used as codebook

    Returns
    -------
    :class:`~pandas.DataFrame`
        data with numerical values converted to categorical values

    See Also
    --------
    :func:`~biopsykit.io.load_codebook`
        load Codebook

    Examples
    --------
    >>> codebook = pd.DataFrame(
    >>>     {
    >>>         0: [None, None, "Morning"],
    >>>         1: ["Male", "No", "Intermediate"],
    >>>         2: ["Female", "Not very often", "Evening"],
    >>>         3: [None, "Often", None],
    >>>         4: [None, "Very often", None]
    >>>     },
    >>>     index=pd.Index(["gender", "smoking", "chronotype"], name="variable")
    >>> )
    >>> apply_codebook(data, codebook)

    """
    is_codebook_dataframe(codebook)
    # map index levels whose name appears in the codebook
    for level_name in data.index.names:
        if level_name in codebook.index:
            data = data.rename(index=codebook.loc[level_name], level=level_name)
    # map columns whose name appears in the codebook; assign instead of the
    # previous `data.loc[:, col].replace(..., inplace=True)` — chained inplace
    # replace is deprecated and does not write through under pandas copy-on-write
    for col in data.columns:
        if col in codebook.index:
            data[col] = data[col].replace(codebook.loc[col])
    return data
def wide_to_long(
    data: pd.DataFrame,
    stubname: str,
    levels: Union[str, Sequence[str]],
    sep: Optional[str] = "_",
) -> pd.DataFrame:
    """Convert a dataframe wide-format into long-format.

    In the wide-format dataframe, the index levels to be converted into long-format are expected to be encoded in the
    column names and separated by ``sep``. If multiple levels should be converted into long-format, e.g., for a
    questionnaire with subscales (level `subscale`) that was assessed pre and post (level `time`), then the different
    levels are all encoded into the string. The level order is specified by ``levels``.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        pandas DataFrame containing saliva data in wide-format, i.e. one column per saliva sample, one row per subject
    stubname : str
        common name for each column to be converted into long-format. Usually, this is either the name of the
        questionnaire (e.g., "PSS") or the saliva type (e.g., "cortisol").
    levels : str or list of str
        index levels of the resulting long-format dataframe.
    sep : str, optional
        character separating index levels in the column names of the wide-format dataframe. Default: ``_``

    Returns
    -------
    :class:`~pandas.DataFrame`
        pandas DataFrame in long-format

    Raises
    ------
    ValueError
        if any index level of ``data`` has no name

    Examples
    --------
    >>> data = pd.DataFrame(
    >>>     columns=[
    >>>         "MDBF_GoodBad_pre", "MDBF_AwakeTired_pre", "MDBF_CalmNervous_pre",
    >>>         "MDBF_GoodBad_post", "MDBF_AwakeTired_post", "MDBF_CalmNervous_post"
    >>>     ],
    >>>     index=pd.Index(range(0, 5), name="subject")
    >>> )
    >>> data_long = wide_to_long(data, stubname="MDBF", levels=["subscale", "time"], sep="_")
    >>> print(data_long.index.names)
    ['subject', 'subscale', 'time']

    """
    if isinstance(levels, str):
        levels = [levels]
    data = data.filter(like=stubname)
    index_cols = list(data.index.names)
    if any(col is None for col in index_cols):
        raise ValueError(
            "All index levels of the dataframe need to have names! Please assign names using "
            "'pandas.Index.set_names()' before using this function!"
        )
    # reverse level order because nested multi-level index will be constructed from back to front
    levels = levels[::-1]
    # iteratively build up long-format dataframe
    for i, level in enumerate(levels):
        # stub names are the column names with the trailing level suffix removed.
        # Split on `sep` (bug fix: the separator used to be hard-coded to "_",
        # which broke wide-format data using any other separator).
        stubnames = sorted({col.rsplit(sep, 1)[0] for col in data.columns})
        data = pd.wide_to_long(
            data.reset_index(),
            stubnames=stubnames,
            # index levels already unstacked in earlier iterations become part of `i`
            i=index_cols + levels[0:i],
            j=level,
            sep=sep,
            suffix=r"\w+",
        )
    # restore the requested level order (undo the reversal above) and sort
    return data.reorder_levels(index_cols + levels[::-1]).sort_index()