Source code for biopsykit.classification.utils

"""Module with utility functions for machine learning and classification applications."""

from copy import deepcopy

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline

__all__ = ["factorize_subject_id", "prepare_df_sklearn", "split_train_test", "strip_df", "strip_labels"]


class _PipelineWrapper:
    """Thin wrapper around a :class:`~sklearn.pipeline.Pipeline`.

    The wrapped pipeline is stored in the ``pipeline`` attribute. ``str()`` and ``repr()`` of the wrapper are
    forwarded to the wrapped pipeline.

    Parameters
    ----------
    pipeline : :class:`~sklearn.pipeline.Pipeline`
        pipeline to wrap

    """

    def __init__(self, pipeline: Pipeline) -> None:
        # only a reference is stored, the pipeline is not copied
        self.pipeline = pipeline

    def __repr__(self) -> str:
        """Return the ``repr`` of the wrapped pipeline."""
        return repr(self.pipeline)

    def __str__(self) -> str:
        """Return the string representation of the wrapped pipeline."""
        return str(self.pipeline)


# TODO: check whether delegating ``__getitem__`` and ``__len__`` to the wrapped pipeline works
# def __getitem__(self, item):
#     return self.pipeline[item]
#
# def __len__(self):
#     return len(self.pipeline)


def strip_df(data: pd.DataFrame) -> np.ndarray:
    """Drop all index levels of a dataframe and return only its values.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        dataframe whose values should be extracted

    Returns
    -------
    :class:`~numpy.ndarray`
        array with the values of ``data``, without any index information

    """
    # discard every index level first, then copy the remaining values into a new array
    index_free = data.reset_index(drop=True)
    return np.array(index_free.values)
[docs]def strip_labels(data: pd.DataFrame | pd.Series, label_col: str | None = None) -> np.ndarray: """Strip labels from dataframe index. Parameters ---------- data : :class:`~pandas.DataFrame` or :class:`~pandas.Series` input data label_col : str, optional name of index level containing class labels or ``None`` to use default column name ("label"). Default: ``None`` Returns ------- :class:`~numpy.ndarray` array with labels """ # TODO change to dataframe column if label_col is None: label_col = "label" if isinstance(data, pd.DataFrame): data = data.index.get_level_values(label_col) return np.array(data)
[docs]def factorize_subject_id( data: pd.Series | pd.DataFrame, subject_col: str | None = None ) -> tuple[np.ndarray, np.ndarray]: """Factorize subject IDs, i.e., encode them as an enumerated type or categorical variable. Parameters ---------- data : :class:`~pandas.DataFrame` or :class:`~pandas.Series` input data subject_col : str, optional name of index level containing subject IDs or ``None`` to use default column name ("subject"). Default: ``None`` Returns ------- groups : :class:`~numpy.ndarray` A numpy array with factorized subject IDs. They also serve as indexer for ``keys``. keys : :class:`~numpy.ndarray` The unique subject ID values. """ if subject_col is None: subject_col = "subject" if isinstance(data, pd.DataFrame): data = data.index.get_level_values(subject_col) groups, keys = pd.factorize(data) return groups, keys
def prepare_df_sklearn(
    data: pd.DataFrame,
    label_col: str | None = None,
    subject_col: str | None = None,
    print_summary: bool | None = False,
) -> tuple[np.ndarray, ...]:
    """Split a dataframe into the single components needed by sklearn functions.

    The following steps are performed:

    * Extract the values of the dataframe without any index level
      (using :func:`~biopsykit.classification.utils.strip_df`)
    * Extract the class labels from the dataframe index
      (using :func:`~biopsykit.classification.utils.strip_labels`)
    * Factorize the subject IDs so that each subject ID is represented by a unique number
      (using :func:`~biopsykit.classification.utils.factorize_subject_id`)

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        input data as pandas dataframe
    label_col : str, optional
        name of the index level that contains the class labels or ``None`` to use the default
        name ("label"). Default: ``None``
    subject_col : str, optional
        name of the index level that contains the subject IDs or ``None`` to use the default
        name ("subject"). Default: ``None``
    print_summary : bool, optional
        ``True`` to print the shapes of the data and label arrays, the number of groups and the
        number of samples per class, ``False`` otherwise. Default: ``False``

    Returns
    -------
    X : array-like of shape (`n_samples`, `n_features`)
        Feature values, where `n_samples` is the number of samples and `n_features` is the number of features.
    y_data : array-like of shape (`n_samples`,)
        Target relative to ``X``, i.e. class labels.
    groups : array-like of shape (`n_samples`,)
        Factorized subject IDs
    group_keys : array-like of shape (`n_groups`,)
        Unique subject IDs

    """
    x_data = strip_df(data)
    y_data = strip_labels(data, label_col)
    groups, group_keys = factorize_subject_id(data, subject_col)

    if print_summary:
        # np.unique(..., return_counts=True)[1] holds the number of samples per class
        summary = (
            f"Shape of X: {x_data.shape}; shape of y: {y_data.shape}; "
            f"number of groups: {len(group_keys)}, class prevalence: {np.unique(y_data, return_counts=True)[1]}"
        )
        print(summary)

    components = (x_data, y_data, groups, group_keys)
    return components
[docs]def split_train_test( X: np.ndarray, # noqa: N803 y: np.ndarray, train: np.ndarray, test: np.ndarray, groups: np.ndarray | None = None, ) -> tuple[np.ndarray, ...]: """Split data into train and test set. Parameters ---------- X : array-like of shape (`n_samples`, `n_features`) Data to be split, where `n_samples` is the number of samples and `n_features` is the number of features. y : array-like of shape (`n_samples`,) Target relative to ``x_data``, i.e. class labels. train : :class:`~numpy.ndarray` The training set indices for that split test : :class:`~numpy.ndarray` The test set indices for that split groups : array-like of shape (`n_samples`,), optional Group labels for the samples used while splitting the dataset into train/test set or ``None`` if group labels should not be considered for splitting. Default: ``None`` Returns ------- X_train: :class:`~numpy.ndarray` Training data X_test: :class:`~numpy.ndarray` Test data y_train: :class:`~numpy.ndarray` Targets of training data y_test: :class:`~numpy.ndarray` Targets of test data group_train: :class:`~numpy.ndarray` Group labels of training data (only available if ``groups`` is not ``None``) group_test: :class:`~numpy.ndarray` Group labels of test data (only available if ``groups`` is not ``None``) """ X_train, X_test = X[train], X[test] # noqa: N806 y_train, y_test = y[train], y[test] if groups is None: return X_train, X_test, y_train, y_test groups_train = groups[train] groups_test = groups[test] return X_train, X_test, y_train, y_test, groups_train, groups_test
def merge_nested_dicts(dict1: dict, dict2: dict) -> dict:
    """Merge two nested dictionaries into a new dictionary.

    Neither input is modified and the result does not share mutable objects with them. The entries of
    ``dict2`` are merged into a deep copy of ``dict1`` according to the following rules:

    * a key that only exists in ``dict2`` is added (as a deep copy),
    * if a key exists in both dictionaries and the value in ``dict2`` is a dict, both values are merged
      recursively (the value in ``dict1`` is expected to be a dict as well),
    * if a key exists in both dictionaries and the value in ``dict2`` is a list, both lists are expected to
      contain dicts with iterable values of hashable items. They are collapsed into a list with *one* dict that
      contains, per key, the union of all values without duplicates, in order of first appearance,
    * for any other value type the value of ``dict1`` is kept.

    Parameters
    ----------
    dict1 : dict
        First dictionary to merge
    dict2 : dict
        Second dictionary to merge

    Returns
    -------
    dict
        Merged dictionary

    """
    return _merge_nested_dicts(deepcopy(dict1), dict2)


def _merge_nested_dicts(dict1: dict, dict2: dict) -> dict:
    """Merge ``dict2`` into ``dict1`` **in place** and return ``dict1``."""
    for key, value in dict2.items():
        if key not in dict1:
            # copy so that later modifications of the result can not leak into ``dict2``
            dict1[key] = deepcopy(value)
        elif isinstance(value, dict):
            _merge_nested_dicts(dict1[key], value)
        elif isinstance(value, list):
            # both lists hold dicts => collapse them into a list with a single merged dict
            dict1[key] = [_union_list_of_dicts(deepcopy(dict1[key] + value))]
        # any other value type: the entry that is already present in ``dict1`` wins
    return dict1


def _union_list_of_dicts(list_of_dicts: list) -> dict:
    """Collapse a list of dicts into one dict holding, per key, the de-duplicated union of all values."""
    seen_per_key: dict = {}
    for entry in list_of_dicts:
        for key, values in entry.items():
            # dicts keep insertion order, so duplicates are dropped while the order of first appearance
            # is preserved (a set would yield an arbitrary order)
            seen_per_key.setdefault(key, {}).update(dict.fromkeys(values))
    return {key: list(seen) for key, seen in seen_per_key.items()}