Source code for biopsykit.classification.utils

"""Module with utility functions for machine learning and classification applications."""
from copy import deepcopy
from typing import Dict, Optional, Tuple, Union

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline

__all__ = ["factorize_subject_id", "prepare_df_sklearn", "split_train_test", "strip_df", "strip_labels"]


class _PipelineWrapper:
    def __init__(self, pipeline: Pipeline):
        self.pipeline = pipeline

    def __str__(self):
        return str(self.pipeline)

    def __repr__(self):
        return repr(self.pipeline)


# TODO check if works?
# def __getitem__(self, item):
#     return self.pipeline[item]
#
# def __len__(self):
#     return len(self.pipeline)


[docs]def strip_df(data: pd.DataFrame) -> np.ndarray:
    """Strip dataframe from all index levels to only contain values.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        input dataframe

    Returns
    -------
    :class:`~numpy.ndarray`
        array of stripped dataframe without index

    """
    return np.array(data.reset_index(drop=True).values)


[docs]def strip_labels(data: Union[pd.DataFrame, pd.Series], label_col: Optional[str] = None) -> np.ndarray:
    """Strip labels from dataframe index.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame` or :class:`~pandas.Series`
        input data
    label_col : str, optional
        name of index level containing class labels or ``None`` to use default column name ("label").
        Default: ``None``

    Returns
    -------
    :class:`~numpy.ndarray`
        array with labels

    """
    # TODO change to dataframe column
    if label_col is None:
        label_col = "label"
    if isinstance(data, pd.DataFrame):
        data = data.index.get_level_values(label_col)
    return np.array(data)


[docs]def factorize_subject_id(
    data: Union[pd.Series, pd.DataFrame], subject_col: Optional[str] = None
) -> Tuple[np.ndarray, np.ndarray]:
    """Factorize subject IDs, i.e., encode them as an enumerated type or categorical variable.

    Parameters
    ----------
    data : :class:`~pandas.DataFrame` or :class:`~pandas.Series`
        input data
    subject_col : str, optional
        name of index level containing subject IDs or ``None`` to use default column name ("subject").
        Default: ``None``

    Returns
    -------
    groups : :class:`~numpy.ndarray`
        A numpy array with factorized subject IDs. They also serve as indexer for ``keys``.
    keys : :class:`~numpy.ndarray`
        The unique subject ID values.

    """
    if subject_col is None:
        subject_col = "subject"

    if isinstance(data, pd.DataFrame):
        data = data.index.get_level_values(subject_col)
    groups, keys = pd.factorize(data)
    return groups, keys


[docs]def prepare_df_sklearn(
    data: pd.DataFrame,
    label_col: Optional[str] = None,
    subject_col: Optional[str] = None,
    print_summary: Optional[bool] = False,
) -> Tuple[np.ndarray, ...]:
    """Prepare a dataframe for usage in sklearn functions and return the single components of the dataframe.

    This function performs the following steps:

    * Strip dataframe from all index levels and return an array that only contains values
      (using :func:`~biopsykit.classification.utils.strip_df`)
    * Extract labels from dataframe (using :func:`~biopsykit.classification.utils.strip_labels`)
    * Factorize subject IDs so that each subject ID has an unique number
      (using :func:`~biopsykit.classification.utils.factorize_subject_id`)


    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        Input data as pandas dataframe
    label_col : str, optional
        name of index level containing class labels or ``None`` to use default column name ("label").
        Default: ``None``
    subject_col : str, optional
        name of index level containing subject IDs or ``None`` to use default column name ("subject").
        Default: ``None``
    print_summary : bool, optional
        ``True`` to print a summary of the shape of the data and label arrays, the number of groups and the class
        prevalence of all classes, ``False`` otherwise.
        Default: ``False``

    Returns
    -------
    X : array-like of shape (`n_samples`, `n_features`)
        Training vector, where `n_samples` is the number of samples and `n_features` is the number of features.
    y_data : array-like of shape (`n_samples`,)
        Target relative to ``X``, i.e. class labels.
    groups : array-like of shape (`n_samples`,)
        Factorized subject IDs
    group_keys : array-like of shape (`n_samples`,)
        Subject IDs

    """
    x_data = strip_df(data)
    y_data = strip_labels(data, label_col)
    groups, group_keys = factorize_subject_id(data, subject_col)

    if print_summary:
        print(
            "Shape of X: {}; shape of y: {}; number of groups: {}, class prevalence: {}".format(
                x_data.shape, y_data.shape, len(group_keys), np.unique(y_data, return_counts=True)[1]
            )
        )

    return x_data, y_data, groups, group_keys


[docs]def split_train_test(
    X: np.ndarray,  # noqa: N803
    y: np.ndarray,
    train: np.ndarray,
    test: np.ndarray,
    groups: Optional[np.ndarray] = None,
) -> Tuple[np.ndarray, ...]:
    """Split data into train and test set.

    Parameters
    ----------
    X : array-like of shape (`n_samples`, `n_features`)
        Data to be split, where `n_samples` is the number of samples and `n_features` is the number of features.
    y : array-like of shape (`n_samples`,)
        Target relative to ``x_data``, i.e. class labels.
    train : :class:`~numpy.ndarray`
        The training set indices for that split
    test : :class:`~numpy.ndarray`
        The test set indices for that split
    groups : array-like of shape (`n_samples`,), optional
        Group labels for the samples used while splitting the dataset into train/test set or ``None`` if group labels
        should not be considered for splitting.
        Default: ``None``


    Returns
    -------
    X_train: :class:`~numpy.ndarray`
        Training data
    X_test: :class:`~numpy.ndarray`
        Test data
    y_train: :class:`~numpy.ndarray`
        Targets of training data
    y_test: :class:`~numpy.ndarray`
        Targets of test data
    group_train: :class:`~numpy.ndarray`
        Group labels of training data (only available if ``groups`` is not ``None``)
    group_test: :class:`~numpy.ndarray`
        Group labels of test data (only available if ``groups`` is not ``None``)

    """
    X_train, X_test = X[train], X[test]  # noqa: N806
    y_train, y_test = y[train], y[test]

    if groups is None:
        return X_train, X_test, y_train, y_test

    groups_train = groups[train]
    groups_test = groups[test]
    return X_train, X_test, y_train, y_test, groups_train, groups_test


def merge_nested_dicts(dict1: Dict, dict2: Dict) -> Dict:
    """Merge two nested dictionaries.

    Parameters
    ----------
    dict1 : dict
        First dictionary to merge
    dict2 : dict
        Second dictionary to merge

    Returns
    -------
    dict
        Merged dictionary

    """
    dict1 = deepcopy(dict1)
    return _merge_nested_dicts(dict1, dict2)


def _merge_nested_dicts(dict1: Dict, dict2: Dict) -> Dict:
    for key, value in dict2.items():
        if isinstance(value, dict) and key in dict1:
            _merge_nested_dicts(dict1[key], value)
            # check if value is list
        elif isinstance(value, list) and key in dict1:
            dict1[key] = value if key not in dict1 else dict1[key] + value
            list_of_dicts = deepcopy(dict1[key])
            merged_dict = {}

            for d in list_of_dicts:
                for k, v in d.items():
                    # Use set to avoid duplicates, then convert it back to a list
                    merged_dict[k] = list(set(merged_dict.get(k, []) + list(v)))

            # Convert the merged result back to dictionaries
            result = [dict(merged_dict)]
            dict1[key] = result
        elif key not in dict1:
            dict1[key] = value
    return dict1