Source code for biopsykit.utils.dataframe_handling

"""Module providing various functions for advanced handling of pandas dataframes."""
import re
from typing import Callable, Optional, Sequence, Union

import numpy as np
import pandas as pd
from biopsykit.utils._datatype_validation_helper import _assert_has_columns, _assert_has_index_levels, _assert_is_dtype
from biopsykit.utils.datatype_helper import CodebookDataFrame, is_codebook_dataframe

__all__ = [
    "apply_codebook",
    "add_space_to_camel",
    "camel_to_snake",
    "snake_to_camel",
    "convert_nan",
    "int_from_str_idx",
    "int_from_str_col",
    "multi_xs",
    "replace_missing_data",
    "stack_groups_percent",
    "wide_to_long",
]


[docs]def int_from_str_idx( data: pd.DataFrame, idx_levels: Union[str, Sequence[str]], regex: Union[str, Sequence[str]], func: Optional[Callable] = None, ) -> pd.DataFrame: """Extract integers from strings in index levels and set them as new index values. Parameters ---------- data : :class:`~pandas.DataFrame` data with index to extract information from idx_levels : str or list of str name of index level or list of index level names regex : str or list of str regex string or list of regex strings to extract integers from strings func : function, optional function to apply to the extracted integer values. This can, for example, be a lambda function which increments all integers by 1. Default: ``None`` Returns ------- :class:`~pandas.DataFrame` dataframe with new index values """ if isinstance(idx_levels, str): idx_levels = [idx_levels] if isinstance(regex, str): regex = [regex] if len(idx_levels) != len(regex): raise ValueError( "Number of values in 'regex' must match number of index levels in 'idx_levels'! " "Got idx_levels: {}, regex: {}.".format(idx_levels, regex) ) _assert_is_dtype(data, pd.DataFrame) _assert_has_index_levels(data, idx_levels, match_atleast=True, match_order=False) idx_names = data.index.names data = data.reset_index() for idx, reg in zip(idx_levels, regex): idx_col = data[idx].str.extract(reg).astype(int)[0] if func is not None: idx_col = func(idx_col) data[idx] = idx_col data = data.set_index(idx_names) return data
[docs]def int_from_str_col( data: pd.DataFrame, col_name: str, regex: str, func: Optional[Callable] = None ) -> Union[pd.Series]: """Extract integers from strings in the column of a dataframe and return it. Parameters ---------- data : :class:`~pandas.DataFrame` data with column names to extract information from col_name : str name of column with string values to extract regex : str regex string used to extract integers from string values func : function, optional function to apply to the extracted integer values. This can, for example, be a lambda function which increments all integers by 1. Default: ``None`` Returns ------- :class:`~pandas.Series` series object with extracted integer values """ _assert_is_dtype(data, pd.DataFrame) _assert_has_columns(data, [[col_name]]) column = data[col_name].str.extract(regex).astype(int)[0] if func is not None: column = func(column) return column
[docs]def camel_to_snake(name: str, lower: Optional[bool] = True): """Convert string in "camelCase" to "snake_case". .. note:: If all letters in ``name`` are capital letters the string will not be computed into snake_case because it is assumed to be an abbreviation. Parameters ---------- name : str string to convert from camelCase to snake_case lower : bool, optional ``True`` to convert all capital letters in to lower case ("actual" snake_case), ``False`` to keep capital letters, if present Returns ------- str string converted into snake_case Examples -------- >>> from biopsykit.utils.dataframe_handling import camel_to_snake >>> camel_to_snake("HelloWorld") hello_world >>> camel_to_snake("HelloWorld", lower=False) Hello_World >>> camel_to_snake("ABC") ABC """ if not name.isupper(): name = re.sub(r"(?<!^)(?=[A-Z])", "_", name) if lower: name = name.lower() return name
[docs]def snake_to_camel(name: str): """Convert string in "snake_case" to "camelCase". Parameters ---------- name : str string to convert from snake_case to camelCase """ return "".join(x.capitalize() or "_" for x in name.split("_"))
[docs]def add_space_to_camel(name: str) -> str: """Add space to string in "camelCase". Parameters ---------- name : str string to transform Returns ------- str string with space added Examples -------- >>> from biopsykit.utils.dataframe_handling import add_space_to_camel >>> add_space_to_camel("HelloWorld") Hello World >>> add_space_to_camel("ABC") ABC """ return re.sub(r"((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))", r" \1", name)
[docs]def replace_missing_data( data: pd.DataFrame, target_col: str, source_col: str, dropna: Optional[bool] = False, inplace: Optional[bool] = False, ) -> Optional[pd.DataFrame]: """Replace missing data in one column by data from another column. Parameters ---------- data : :class:`~pandas.DataFrame` input data with values to replace target_col : str target column, i.e., column in which missing values should be replaced source_col : str source column, i.e., column values used to replace missing values in ``target_col`` dropna : bool, optional whether to drop rows with missing values in ``target_col`` or not. Default: ``False`` inplace : bool, optional whether to perform the operation inplace or not. Default: ``False`` Returns ------- :class:`~pandas.DataFrame` or ``None`` dataframe with replaced missing values or ``None`` if ``inplace`` is ``True`` """ _assert_is_dtype(data, pd.DataFrame) if not inplace: data = data.copy() data[target_col].fillna(data[source_col], inplace=True) # noqa: PD002 if dropna: data = data.dropna(subset=[target_col]) if inplace: return None return data
[docs]def convert_nan( data: Union[pd.DataFrame, pd.Series], inplace: Optional[bool] = False ) -> Union[pd.DataFrame, pd.Series, None]: """Convert missing values to NaN. Data exported from programs like SPSS often uses negative integers to encode missing values because these negative numbers are "unrealistic" values. Use this function to convert these negative numbers to "actual" missing values: not-a-number (``NaN``). Values that will be replaced with ``NaN`` are -66, -77, -99 (integer and string representations). Parameters ---------- data : :class:`~pandas.DataFrame` or :class:`~pandas.Series` input data inplace : bool, optional whether to perform the operation inplace or not. Default: ``False`` Returns ------- :class:`~pandas.DataFrame` or ``None`` dataframe with converted missing values or ``None`` if ``inplace`` is ``True`` """ _assert_is_dtype(data, (pd.DataFrame, pd.Series)) if not inplace: data = data.copy() data = data.replace([-99.0, -77.0, -66.0, "-99", "-77", "-66"], np.nan) if inplace: return None return data
[docs]def multi_xs( data: Union[pd.DataFrame, pd.Series], keys: Union[str, Sequence[str]], level: Union[str, int, Sequence[str], Sequence[int]], drop_level: Optional[bool] = True, ) -> Union[pd.DataFrame, pd.Series]: """Return cross-section of multiple keys from the dataframe. This function internally calls the :meth:`pandas.DataFrame.xs` method, but it can take a list of key arguments to return multiple keys at once, in comparison to the original :meth:`~pandas.DataFrame.xs` method which only takes one possible key. Parameters ---------- data : :class:`~pandas.DataFrame` or :class:`~pandas.Series` input data to get cross-section from keys : str or list of str label(s) contained in the index, or partially in a :class:`~pandas.MultiIndex` level : str, int, or list of such in case of keys partially contained in a :class:`~pandas.MultiIndex`, indicate which index levels are used. Levels can be referred by label or position. drop_level : bool, optional if ``False``, returns object with same levels as self. Default: ``True`` Returns ------- :class:`~pandas.DataFrame` or :class:`~pandas.Series` cross-section from the original dataframe or series """ _assert_is_dtype(data, (pd.DataFrame, pd.Series)) if isinstance(keys, str): keys = [keys] levels = data.index.names data_xs = pd.concat({key: data.xs(key, level=level, drop_level=drop_level) for key in keys}, names=[level]) return data_xs.reorder_levels(levels).sort_index()
[docs]def stack_groups_percent( data: pd.DataFrame, hue: str, stacked: str, order: Optional[Sequence[str]] = None ) -> pd.DataFrame: """Create dataframe with stacked groups. To create a stacked bar chart, i.e. a plot with different bar charts along a categorical axis, where the variables of each bar chart are stacked along the value axis, the data needs to be rearranged and normalized in percent. The columns of the resulting dataframe be the categorical values specified by ``hue``, the index items will be the variables specified by ``stacked``. Parameters ---------- data : :class:`~pandas.DataFrame` data to compute stacked group in percent hue : str column name of grouping categorical variable. This typically corresponds to the ``x`` axis in a stacked bar chart. stacked : str column name of variable that is stacked along the ``y`` axis order : str order of categorical variable specified by ``hue`` Returns ------- :class:`~pandas.DataFrame` dataframe in a format that can be used to create a stacked bar chart See Also -------- :func:`~biopsykit.plotting.stacked_barchart` function to create a stacked bar chart """ data_grouped = pd.DataFrame(data.groupby([hue] + [stacked]).size(), columns=["data"]) data_grouped = data_grouped.groupby(hue).apply(lambda x: 100 * (x / x.sum())).T.stack().T if order is not None: data_grouped = data_grouped.reindex(order) return data_grouped["data"]
[docs]def apply_codebook(data: pd.DataFrame, codebook: CodebookDataFrame) -> pd.DataFrame: """Apply codebook to convert numerical to categorical values. The codebook is expected to be a dataframe in a standardized format (see :obj:`~biopsykit.utils.datatype_helper.CodebookDataFrame` for further information). Parameters ---------- codebook : :obj:`~biopsykit.utils.datatype_helper.CodebookDataFrame` path to codebook or dataframe to be used as codebook data : :class:`~pandas.DataFrame` data to apply codebook on Returns ------- :class:`~pandas.DataFrame` data with numerical values converted to categorical values See Also -------- :func:`~biopsykit.io.load_codebook` load Codebook Examples -------- >>> codebook = pd.DataFrame( >>> { >>> 0: [None, None, "Morning"], >>> 1: ["Male", "No", "Intermediate"], >>> 2: ["Female", "Not very often", "Evening"], >>> 3: [None, "Often", None], >>> 4: [None, "Very often", None] >>> }, >>> index=pd.Index(["gender", "smoking", "chronotype"], name="variable") >>> ) >>> apply_codebook(codebook, data) """ is_codebook_dataframe(codebook) for col in data.index.names: if col in codebook.index: data = data.rename(index=codebook.loc[col], level=col) for col in data.columns: if col in codebook.index: data.loc[:, col].replace(codebook.loc[col], inplace=True) # noqa: PD002 return data
[docs]def wide_to_long( data: pd.DataFrame, stubname: str, levels: Union[str, Sequence[str]], sep: Optional[str] = "_", ) -> pd.DataFrame: """Convert a dataframe wide-format into long-format. In the wide-format dataframe, the index levels to be converted into long-format are expected to be encoded in the column names and separated by ``sep``. If multiple levels should be converted into long-format, e.g., for a questionnaire with subscales (level `subscale`) that was assessed pre and post (level `time`), then the different levels are all encoded into the string. The level order is specified by ``levels``. Parameters ---------- data : :class:`~pandas.DataFrame` pandas DataFrame containing saliva data in wide-format, i.e. one column per saliva sample, one row per subject stubname : str common name for each column to be converted into long-format. Usually, this is either the name of the questionnaire (e.g., "PSS") or the saliva type (e.g., "cortisol"). levels : str or list of str index levels of the resulting long-format dataframe. sep : str, optional character separating index levels in the column names of the wide-format dataframe. Default: ``_`` Returns ------- :class:`~pandas.DataFrame` pandas DataFrame in long-format Examples -------- >>> data = pd.DataFrame( >>> columns=[ >>> "MDBF_GoodBad_pre", "MDBF_AwakeTired_pre", "MDBF_CalmNervous_pre", >>> "MDBF_GoodBad_post", "MDBF_AwakeTired_post", "MDBF_CalmNervous_post" >>> ], >>> index=pd.Index(range(0, 5), name="subject") >>> ) >>> data_long = wide_to_long(data, stubname="MDBF", levels=["subscale", "time"], sep="_") >>> print(data_long.index.names) ['subject', 'subscale', 'time'] >>> print(data_long.index) MultiIndex([(0, 'AwakeTired', 'post'), (0, 'AwakeTired', 'pre'), (0, 'CalmNervous', 'post'), (0, 'CalmNervous', 'pre'), (0, 'GoodBad', 'post'), (0, 'GoodBad', 'pre'), (1, 'AwakeTired', 'post'), ... """ if isinstance(levels, str): levels = [levels] data = data.filter(like=stubname) index_cols = list(data.index.names) if any(col is None for col in index_cols): raise ValueError( "All index levels of the dataframe need to have names! Please assign names using " "'pandas.Index.set_names()' before using this function!" ) # reverse level order because nested multi-level index will be constructed from back to front levels = levels[::-1] # iteratively build up long-format dataframe for i, level in enumerate(levels): stubnames = list(data.columns) # stubnames are everything except the last part separated by underscore stubnames = sorted({"_".join(s.split("_")[:-1]) for s in stubnames}) data = pd.wide_to_long( data.reset_index(), stubnames=stubnames, i=index_cols + levels[0:i], j=level, sep=sep, suffix=r"\w+", ) # reorder levels and sort return data.reorder_levels(index_cols + levels[::-1]).sort_index()