# Source code for biopsykit.classification.model_selection.sklearn_pipeline_permuter

"""Module for systematically evaluating different combinations of sklearn pipelines."""
import functools
import pickle
import re
from copy import deepcopy
from inspect import getmembers, signature
from itertools import product
from pathlib import Path
from shutil import rmtree
from typing import Any, Dict, Optional, Sequence, Tuple, Union

import numpy as np
import pandas as pd
import sklearn.metrics
from biopsykit.classification.model_selection import nested_cv_param_search
from biopsykit.classification.utils import _PipelineWrapper, merge_nested_dicts
from biopsykit.utils._datatype_validation_helper import _assert_file_extension
from biopsykit.utils._types import path_t, str_t
from joblib import Memory
from numpy.random import RandomState
from sklearn.base import BaseEstimator, clone
from sklearn.model_selection import BaseCrossValidator
from sklearn.pipeline import Pipeline
from tqdm.auto import tqdm
from typing_extensions import Self

__all__ = ["SklearnPipelinePermuter"]

# Maps the ``pipeline_<step>`` index level names used in the metric summary
# to the column headers printed in the exported LaTeX table.
pipeline_step_map = {
    "pipeline_scaler": "Scaler",
    "pipeline_reduce_dim": r"\makecell[lc]{Feature\\ Selection}",
    "pipeline_clf": "Classifier",
}

# Maps metric names to the LaTeX column headers used in the exported table
# (values are reported in percent).
metric_map = {
    "accuracy": r"\makecell{Accuracy [\%]}",
    "f1": r"\makecell{F1-score [\%]}",
    "precision": r"\makecell{Precision [\%]}",
    "recall": r"\makecell{Recall [\%]}",
    "auc": r"\makecell{AUC [\%]}",
    "sensitivity": r"\makecell{Sensitivity [\%]}",
    "specificity": r"\makecell{Specificity [\%]}",
}

# Maps scikit-learn class names to the abbreviations shown in the exported
# LaTeX table index.
clf_map = {
    "MinMaxScaler": "Min-Max",
    "StandardScaler": "Standard",
    "SelectKBest": "SkB",
    "RFE": "RFE",
    "SelectFromModel": "SFM",
    "GaussianNB": "NB",
    "KNeighborsClassifier": "kNN",
    "DecisionTreeClassifier": "DT",
    "SVC": "SVM",
    "RandomForestClassifier": "RF",
    "MLPClassifier": "MLP",
    "AdaBoostClassifier": "Ada",
}

[docs]class SklearnPipelinePermuter: """Class for systematically evaluating different sklearn pipeline combinations.""" def __init__( self, model_dict: Optional[Dict[str, Dict[str, BaseEstimator]]] = None, param_dict: Optional[Dict[str, Optional[Union[Sequence[Dict[str, Any]], Dict[str, Any]]]]] = None, hyper_search_dict: Optional[Dict[str, Dict[str, Any]]] = None, random_state: Optional[int] = None, ): """Class for systematically evaluating different sklearn pipeline combinations. This class can be used to, for instance, evaluate combinations of different feature selection methods (e.g., :class:`~sklearn.feature_selection.SelectKBest`, :class:`~sklearn.feature_selection.SequentialFeatureSelector`) with different estimators (e.g., :class:`~sklearn.svm.SVC`, :class:`~sklearn.tree.DecisionTreeClassifier`), any much more. For all combinations, hyperparameter search (e.g., using grid-search or randomized-search) can be performed by passing one joint parameter grid (see Examples). Parameters ---------- model_dict : dict Dictionary specifying the different transformers and estimators to evaluate. Each pipeline step corresponds to one dictionary entry and has the name of the pipeline step (str) as key. The values are again dictionaries with the transformer/estimator names as keys and instances of the transformers/estimators as values param_dict : dict Nested dictionary specifying the parameter settings to try per transformer/estimator. The dictionary has the transformer/estimator names (str) as keys and parameter dictionaries as values. Each parameter dictionary has parameters names (str) as keys and lists of parameter settings to try as values, or a list of such dictionaries, in which case the grids spanned by each dictionary in the list are explored. This enables searching over any sequence of parameter settings. 
hyper_search_dict : dict, optional Nested dictionary specifying the method for hyperparameter search (e.g., whether to use "grid" for grid-search or "random" for randomized-search) for each estimator. By default, "grid-search" is used for each estimator unless individually specified otherwise. random_state : int, optional Controls the random seed passed to each estimator and each splitter. By default, no random seed is passed. Set this to an integer for reproducible results across multiple program calls. Examples -------- >>> from sklearn import datasets >>> from sklearn.preprocessing import StandardScaler, MinMaxScaler >>> from sklearn.feature_selection import SelectKBest, RFE >>> from sklearn.neighbors import KNeighborsClassifier >>> from sklearn.svm import SVC >>> from sklearn.tree import DecisionTreeClassifier >>> from sklearn.ensemble import AdaBoostClassifier >>> from sklearn.model_selection import KFold >>> >>> from biopsykit.classification.model_selection import SklearnPipelinePermuter >>> >>> breast_cancer = datasets.load_breast_cancer() >>> X = >>> y = >>> >>> model_dict = { >>> "scaler": { >>> "StandardScaler": StandardScaler(), >>> "MinMaxScaler": MinMaxScaler(), >>> }, >>> "reduce_dim": { >>> "SelectKBest": SelectKBest(), >>> "RFE": RFE(SVC(kernel="linear", C=1)) >>> }, >>> "clf" : { >>> "KNeighborsClassifier": KNeighborsClassifier(), >>> "DecisionTreeClassifier": DecisionTreeClassifier(), >>> "SVC": SVC(), >>> "AdaBoostClassifier": AdaBoostClassifier(), >>> } >>> } >>> >>> param_dict = { >>> "StandardScaler": None, >>> "MinMaxScaler": None, >>> "SelectKBest": { "k": [2, 4, 6, 8, "all"] }, >>> "RFE": { "n_features_to_select": [2, 4, 6, 8, None] }, >>> "KNeighborsClassifier": { "n_neighbors": [2, 4, 6, 8], "weights": ["uniform", "distance"] }, >>> "DecisionTreeClassifier": {"criterion": ['gini', 'entropy'], "max_depth": [2, 4, 6, 8, 10] }, >>> "AdaBoostClassifier": { >>> "base_estimator": [DecisionTreeClassifier(max_depth=1), 
DecisionTreeClassifier(max_depth=2)], >>> "n_estimators": np.arange(20, 210, 10), >>> "learning_rate": np.arange(0.6, 1.1, 0.1) >>> }, >>> "SVC": [ >>> { >>> "kernel": ["linear"], >>> "C": np.logspace(start=-3, stop=3, num=7) >>> }, >>> { >>> "kernel": ["rbf"], >>> "C": np.logspace(start=-3, stop=3, num=7), >>> "gamma": np.logspace(start=-3, stop=3, num=7) >>> } >>> ] >>> } >>> >>> # AdaBoost hyperparameters should be optimized using randomized-search, all others using grid-search >>> hyper_search_dict = { >>> "AdaBoostClassifier": {"search_method": "random", "n_iter": 30} >>> } >>> >>> pipeline_permuter = SklearnPipelinePermuter(model_dict, param_dict, hyper_search_dict) >>>, y, outer_cv=KFold(), inner_cv=KFold()) """ self.models: Dict[str, Dict[str, BaseEstimator]] = {} """Dictionary with pipeline steps and the different transformers/estimators per step.""" self.params: Dict[str, Optional[Union[Sequence[Dict[str, Any]], Dict[str, Any]]]] = {} """Dictionary with parameter sets to test for the different transformers/estimators per pipeline step.""" self.model_combinations: Sequence[Tuple[Tuple[str, str], ...]] = [] """List of model combinations, i.e. 
permutations of the different transformers/estimators for each pipeline step.""" self.hyper_search_dict: Dict[str, Dict[str, Any]] = {} """Dictionary specifying the selected hyperparameter search method for each estimator.""" self.param_searches: Dict[Tuple[str, str], Dict[str, Any]] = {} """Dictionary with parameter search results for each pipeline step combination.""" self.results: Optional[pd.DataFrame] = None """Dataframe with parameter search results of each pipeline step combination.""" self.scoring: str_t = "" """Scoring used as metric for optimization during hyperparameter search.""" self.refit: str = "" self.random_state: Optional[RandomState] = None self._results_set: bool = False if model_dict is None and param_dict is None: # create empty instance return self.random_state = RandomState(random_state) self._set_permuter_params(model_dict, param_dict, hyper_search_dict) def _set_permuter_params(self, model_dict, param_dict, hyper_search_dict): self._check_missing_params(model_dict, param_dict) if hyper_search_dict is None: hyper_search_dict = {} self.hyper_search_dict = hyper_search_dict.copy() clf_list = model_dict[list(model_dict.keys())[-1]] for clf in clf_list: # fill the dict with the default search method (grid-search) for the classifiers that are not # specified explicitly self.hyper_search_dict.setdefault(clf, {"search_method": "grid"}) model_combinations = list(product(*[[(step, k) for k in list(model_dict[step].keys())] for step in model_dict])) # assert that all entries of the param dict are lists for uniform handling for k, v in param_dict.items(): if isinstance(v, dict): param_dict[k] = [v] model_dict = deepcopy(model_dict) self.models = self._initialize_models(model_dict) self.params = param_dict self.model_combinations = model_combinations def _initialize_models(self, model_dict: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]: if self.random_state is None: return model_dict for _k, v in model_dict.items(): # add fixed random state 
to each estimator if it has a random_state parameter for estimator in v.values(): if hasattr(estimator, "random_state"): estimator.random_state = self.random_state return model_dict @property def results(self): """Parameter search results of each pipeline step combination. Returns ------- :class:`~pandas.DataFrame` Dataframe with parameter search results of each pipeline step combination """ if self._results is None: self._results = self.pipeline_score_results() return self._results @results.setter def results(self, results): if results is None: self._results_set = False else: self._results_set = True self._results = results
[docs] @classmethod def from_csv(cls, file_path: path_t, num_pipeline_steps: Optional[int] = 3) -> Self: """Create a new ``SklearnPipelinePermute`` instance from a csv file with exported results from parameter search. Parameters ---------- file_path : :class:`pathlib.Path` or str path to csv file num_pipeline_steps : int integer specifying the number of steps in the pipeline. Used to infer pipeline steps from the :class:`~pandas.MultiIndex` in the dataframe. For instance, for a pipeline consisting of the steps "scaler", "reduce_dim", and "clf" pass "3" as ``num_pipeline_steps`` Returns ------- :class:`~biopsykit.classification.model_selection.SklearnPipelinePermuter` ``SklearnPipelinePermuter`` instance with results from csv file """ # assert pathlib file_path = Path(file_path) _assert_file_extension(file_path, ".csv") score_summary = pd.read_csv(file_path) score_summary = score_summary.set_index(list(score_summary.columns)[: num_pipeline_steps + 2]) pipeline_permuter = SklearnPipelinePermuter() pipeline_permuter.results = score_summary return pipeline_permuter
[docs] def fit( self, X: np.ndarray, # noqa: N803 y: np.ndarray, *, outer_cv: BaseCrossValidator, inner_cv: BaseCrossValidator, scoring: Optional[str_t] = None, use_cache: Optional[bool] = True, **kwargs, ): """Run fit for all pipeline combinations and sets of parameters. This function calls :func:`~biopsykit.classification.model_selection.nested_cv_param_search` for all Pipeline combinations and stores the results in the ``param_searches`` attribute. Parameters ---------- X : array-like of shape (`n_samples`, `n_features`) Training vector, where `n_samples` is the number of samples and `n_features` is the number of features. y : array-like of shape (`n_samples`, `n_output`) or (`n_samples`,) Target (i.e., class labels) relative to X for classification or regression. outer_cv : `CV splitter`_ Cross-validation object determining the cross-validation splitting strategy of the outer cross-validation. inner_cv : `CV splitter`_ Cross-validation object determining the cross-validation splitting strategy of the hyperparameter search. scoring : str, optional A str specifying the scoring metric to use for evaluation. use_cache : bool, optional ``True`` to cache fitted transformer instances of the pipeline in a caching directory (can be provided by the additional parameter ``cachedir_name``), ``False`` otherwise. Default: ``True`` **kwargs : Additional arguments that are passed to :func:`~biopsykit.classification.model_selection.nested_cv_parameter_search` and the hyperparameter search class instance (e.g., :class:`~sklearn.model_selection.GridSearchCV` or :class:`~sklearn.model_selection.RandomizedSearchCV`). """ return self._fit(X=X, y=y, outer_cv=outer_cv, inner_cv=inner_cv, scoring=scoring, use_cache=use_cache, **kwargs)
def _fit( # noqa: PLR0912, C901 self, *, X: np.ndarray, # noqa: N803 y: np.ndarray, outer_cv: BaseCrossValidator, inner_cv: BaseCrossValidator, save_intermediate: Optional[bool] = False, file_path: Optional[path_t] = None, scoring: Optional[str_t] = None, use_cache: Optional[bool] = True, **kwargs, ): self.results = None if len(self.model_combinations) == 0: raise ValueError("No model combinations specified. Please specify at least one model combination.") kwargs.setdefault("n_jobs", -1) kwargs.setdefault("verbose", 1) kwargs.setdefault("error_score", "raise") # Create a temporary folder to store the transformers of the pipeline location = kwargs.pop("cachedir_name", "cachedir") memory = None if use_cache: memory = Memory(location=location, verbose=0) if scoring is None: scoring = "accuracy" self.scoring = scoring refit = kwargs.get("refit") if refit is None: refit = scoring self.refit = refit for model_combination in tqdm(self.model_combinations, desc="Pipeline Combinations"): if model_combination in self.param_searches: print(f"Skipping {model_combination} since this combination was already fitted!") # continue if we already tried this combination continue pipeline_params = [(m, self.params[k[1]]) for m, k in zip(self.models.keys(), model_combination)] pipeline_params = list(filter(lambda p: p[1] is not None, pipeline_params)) pipeline_params = [(m, k_new) for m, k in pipeline_params for k_new in k if k is not None] cats = {p[0] for p in pipeline_params} pipeline_params = [list(filter(lambda p, c=cat: p[0] == c, pipeline_params)) for cat in cats] pipeline_params = list(product(*pipeline_params)) pipeline_params = [ tuple({f"{step[0]}__{k}": v for k, v in step[1].items()} for step in combi) for combi in pipeline_params ] pipeline_params = [{k: v for x in param for k, v in x.items()} for param in pipeline_params] if kwargs["verbose"] >= 1: print( f"### Running hyperparameter search for pipeline: " f"{model_combination} with {len(pipeline_params)} parameter 
grid(s):" ) for j, param_dict in enumerate(pipeline_params): hyper_search_params = self.hyper_search_dict[model_combination[-1][1]] model_cls = [(step, self.models[step][m]) for step, m in model_combination] for i in range(len(model_cls)): if isinstance(model_cls[i][1], BaseEstimator): model_cls[i] = (model_cls[i][0], clone(model_cls[i][1])) pipeline = Pipeline(model_cls, memory=memory) if kwargs["verbose"] >= 1: print(f"Parameter grid #{j} ({hyper_search_params}): {param_dict}") result_dict = nested_cv_param_search( X, y, param_dict=param_dict, pipeline=pipeline, outer_cv=outer_cv, inner_cv=inner_cv, scoring=scoring, hyper_search_params=hyper_search_params, random_state=self.random_state, **kwargs, ) self.param_searches[model_combination] = result_dict if kwargs["verbose"] >= 1: print("") if save_intermediate: # Save intermediate results to file self.to_pickle(file_path) if kwargs["verbose"] >= 1: print("") if use_cache: # Delete the temporary cache before exiting memory.clear(warn=False) rmtree(location)
[docs] def fit_and_save_intermediate( self, X: np.ndarray, # noqa: N803 y: np.ndarray, *, outer_cv: BaseCrossValidator, inner_cv: BaseCrossValidator, file_path: path_t, scoring: Optional[str_t] = None, use_cache: Optional[bool] = True, **kwargs, ): """Run fit for all pipeline combinations and sets of parameters and save intermediate results to file. This function calls :func:`~biopsykit.classification.model_selection.nested_cv_param_search` for all Pipeline combinations and stores the results in the ``param_searches`` attribute. After each model combination, the results are saved to a pickle file. Parameters ---------- X : array-like of shape (`n_samples`, `n_features`) Training vector, where `n_samples` is the number of samples and `n_features` is the number of features. y : array-like of shape (`n_samples`, `n_output`) or (`n_samples`,) Target (i.e., class labels) relative to X for classification or regression. outer_cv : `CV splitter`_ Cross-validation object determining the cross-validation splitting strategy of the outer cross-validation. inner_cv : `CV splitter`_ Cross-validation object determining the cross-validation splitting strategy of the hyperparameter search. file_path : :class:`pathlib.Path` or str path to pickle file scoring : str, optional A str specifying the scoring metric to use for evaluation. use_cache : bool, optional ``True`` to cache fitted transformer instances of the pipeline in a caching directory (can be provided by the additional parameter ``cachedir_name``), ``False`` otherwise. Default: ``True`` **kwargs : Additional arguments that are passed to :func:`~biopsykit.classification.model_selection.nested_cv_parameter_search` and the hyperparameter search class instance (e.g., :class:`~sklearn.model_selection.GridSearchCV` or :class:`~sklearn.model_selection.RandomizedSearchCV`). 
""" return self._fit( X=X, y=y, outer_cv=outer_cv, inner_cv=inner_cv, save_intermediate=True, file_path=file_path, scoring=scoring, use_cache=use_cache, **kwargs, )
[docs] @functools.lru_cache(maxsize=5) # noqa: B019 def pipeline_score_results(self) -> pd.DataFrame: """Return parameter search results for each pipeline combination. Returns ------- :class:`~pandas.DataFrame` dataframe with parameter search results for each pipeline combination """ if self._results_set: return self.results if len(self.param_searches) == 0: raise AttributeError( "No results available because pipelines were not fitted! Call `` first." ) gs_param_list = [] for param, gs in self.param_searches.items(): dict_folds = {} for i, res in enumerate(gs["cv_results"]): df_res = pd.DataFrame(res) df_res = df_res.drop(columns=df_res.filter(like="time").columns) = "parameter_combination_id" # note: the best_estimator from the fold does not necessarily correspond to the pipeline returned # from best_pipeline() because the best_pipeline is determined by averaging over all folds and # can hence have a different set of hyperparameters # best_estimator = gs["best_estimator"][i] # df_res["best_estimator"] = _PipelineWrapper(best_estimator) dict_folds[i] = df_res param_dict = {f"pipeline_{key}": val for key, val in param} df_gs = pd.concat(dict_folds, names=["outer_fold"]) df_gs[list(param_dict.keys())] = [list(param_dict.values())] * len(df_gs) df_gs = df_gs.set_index(list(df_gs.filter(like="pipeline").columns), append=True) # reorder levels so that pipeline steps are the first index levels, then the hyperparameter # combination id, then the outer_fold id gs_param_list.append(df_gs) df_summary = pd.concat(gs_param_list) df_summary = df_summary.infer_objects() df_summary = df_summary.reorder_levels( df_summary.index.names[2:] + [df_summary.index.names[1]] + [df_summary.index.names[0]] ) self.results = df_summary.sort_index().sort_index(axis=1) return self.results
[docs] def metric_summary( self, additional_metrics: Optional[str_t] = None, pos_label: Optional[str] = None ) -> pd.DataFrame: """Return summary with all performance metrics for the `best-performing estimator` of each pipeline combination. The `best-performing estimator` for each pipeline combination is the `best_estimator_` that :class:`~sklearn.model_selection.GridSearchCV` returns for each outer fold, i.e. the pipeline which yielded the highest average test score (over all inner folds). Parameters ---------- additional_metrics : str or list of str, optional additional metrics to compute. Default: ``None``. Available metrics can be found in scikit-learn's `metrics and scoring <>`_ module. pos_label : str, optional positive label for binary classification, must be specified if `additional_metrics` is specified. Returns ------- :class:`~pandas.DataFrame` dataframe with performance metric summary the `best estimator` of each pipeline combination. """ if len(self.param_searches) == 0: raise AttributeError( "No results available because pipelines were not fitted! Call `` first." 
) list_metric_summary = [] for param_key, param_value in self.param_searches.items(): param_dict = {f"pipeline_{key}": val for key, val in param_key} conf_matrix = np.sum(param_value["conf_matrix"], axis=0) true_labels = np.array(param_value["true_labels"], dtype="object") predicted_labels = np.array(param_value["predicted_labels"], dtype="object") train_indices = np.array(param_value["train_indices"], dtype="object") test_indices = np.array(param_value["test_indices"], dtype="object") df_metric = pd.DataFrame(param_dict, index=[0]) df_metric["conf_matrix"] = [list(conf_matrix.flatten())] df_metric["conf_matrix_folds"] = [[cm.flatten() for cm in param_value["conf_matrix"]]] df_metric["true_labels"] = [np.concatenate(true_labels)] df_metric["true_labels_folds"] = [true_labels] df_metric["predicted_labels"] = [np.concatenate(predicted_labels)] df_metric["predicted_labels_folds"] = [predicted_labels] df_metric["train_indices"] = [np.concatenate(train_indices)] df_metric["train_indices_folds"] = [train_indices] df_metric["test_indices"] = [np.concatenate(test_indices)] df_metric["test_indices_folds"] = [test_indices] scoring = self.scoring if isinstance(scoring, str): scoring = [scoring] for score_key in scoring: key = f"test_{score_key}" test_scores = self.param_searches[param_key][key] df_metric[f"mean_{key}"] = np.mean(test_scores) df_metric[f"std_{key}"] = np.std(test_scores) df_metric[[f"{key}_fold_{i}" for i in range(len(test_scores))]] = list(test_scores) df_metric = df_metric.set_index(list(df_metric.columns)[: len(param_dict)]) list_metric_summary.append(df_metric) metric_summary = pd.concat(list_metric_summary) if additional_metrics is not None: metric_summary = self.compute_additional_metrics( metric_summary, metrics=additional_metrics, pos_label=pos_label ) return metric_summary
[docs] def export_pipeline_score_results(self, file_path: path_t) -> None: """Export pipeline score results as csv file. Parameters ---------- file_path : :class:`~pathlib.Path` or str file path to export """ file_path = Path(file_path) _assert_file_extension(file_path, ".csv") self.results.to_csv(file_path)
[docs] def export_metric_summary(self, file_path: path_t) -> None: """Export performance metric summary as csv file. Parameters ---------- file_path : :class:`~pathlib.Path` or str file path to export """ file_path = Path(file_path) _assert_file_extension(file_path, ".csv") self.metric_summary().to_csv(file_path, sep=";")
[docs] def best_estimator_summary(self) -> pd.DataFrame: """Return a dataframe with the `best estimator` instances of all pipeline combinations for each fold. Each entry of the dataframe is a list of :class:`~sklearn.pipeline.Pipeline` objects whe returned the . Returns ------- :class:`~pandas.DataFrame` dataframe with `best estimator` instances """ best_estimator_list = [] for param_key, param_value in self.param_searches.items(): param_dict = {f"pipeline_{key}": val for key, val in param_key} df_be = pd.DataFrame(param_dict, index=[0]) df_be["best_estimator"] = _PipelineWrapper(param_value["best_estimator"]) df_be = df_be.set_index(list(df_be.columns)[:-1]) best_estimator_list.append(df_be) return pd.concat(best_estimator_list)
[docs] @functools.lru_cache(maxsize=5) # noqa: B019 def mean_pipeline_score_results(self) -> pd.DataFrame: """Compute mean score results for each pipeline combination and hyperparameter combination. Returns ------- :class:`~pandas.DataFrame` dataframe with mean score results for each pipeline combination and each parameter combination, sorted by the highest mean score. Notes ----- The pipeline with the highest "mean over the mean test scores" does not necessarily correspond to the best-performing pipeline as returned by :func:`~biopsykit.classification.model_selection.SklearnPipelinePermuter.metric_summary` or :func:`~biopsykit.classification.model_selection.SklearnPipelinePermuter.best_estimator_summary` because the best-performing pipelines are determined by averaging the `best_estimator` instances, as determined by `scikit-learn` over all folds. Hence, all `best_estimator` instances can have a **different** set of hyperparameters. This function should only be used if you want to gain a deeper understanding of the different hyperparameter combinations and their performance. If you want to get the best-performing pipeline(s) to report in a paper, use :func:`~biopsykit.classification.model_selection.SklearnPipelinePermuter.metric_summary` or :func:`~biopsykit.classification.model_selection.SklearnPipelinePermuter.best_estimator_summary` instead. """ score_results = self.pipeline_score_results() score_summary_mean = ( score_results.groupby(score_results.index.names[:-1]) .agg(["mean", "std"]) .sort_values(by=(f"mean_test_{self.refit}", "mean"), ascending=False) ) return score_summary_mean
[docs] def best_hyperparameter_pipeline(self) -> pd.DataFrame: """Return the evaluation results for the pipeline with the best-performing hyperparameter set. This returns the pipeline with the **unique** hyperparameter combination that achieved the highest mean score over all outer folds. Notes ----- This `best pipeline` does not necessarily correspond to the overall best-performing pipeline as returned by :func:`~biopsykit.classification.model_selection.SklearnPipelinePermuter.metric_summary` or :func:`~biopsykit.classification.model_selection.SklearnPipelinePermuter.best_estimator_summary` because the best-performing pipelines are determined by averaging the `best_estimator` instances, as determined by `scikit-learn` over all folds. Hence, all `best_estimator` instances can have a **different** set of hyperparameters. This function returns the pipeline with the **unique** hyperparameter combination that achieved the highest mean score over all outer folds. This function should only be used if you want to gain a deeper understanding of the different hyperparameter combinations and their performance. If you want to get the best-performing pipeline(s) to report in a paper, use :func:`~biopsykit.classification.model_selection.SklearnPipelinePermuter.metric_summary` or :func:`~biopsykit.classification.model_selection.SklearnPipelinePermuter.best_estimator_summary` instead. Returns ------- :class:`~pandas.DataFrame` dataframe with the evaluation results of the best pipeline over all outer folds """ score_summary = self.pipeline_score_results() score_summary_mean = self.mean_pipeline_score_results() return score_summary.loc[score_summary_mean.index[0]].dropna(how="all", axis=1)
@staticmethod def _check_missing_params( model_dict: Dict[str, Dict[str, BaseEstimator]], param_dict: Dict[str, Optional[Union[Sequence[Dict[str, Any]], Dict[str, Any]]]], ): for category in model_dict: if not set(model_dict[category].keys()).issubset(set(param_dict.keys())): missing_params = list(set(model_dict[category].keys()) - set(param_dict.keys())) raise ValueError(f"Some estimators are missing parameters: {missing_params}")
[docs] def metric_summary_to_latex( self, data: Optional[pd.DataFrame] = None, metrics: Sequence[str] = None, pipeline_steps: Optional[Sequence[str]] = None, si_table_format: Optional[str] = None, highlight_best: Optional[Union[str, bool]] = None, **kwargs, ) -> str: """Return a latex table with the performance metrics of the pipeline combinations. By default, this function uses the attribute of the ``SklearnPipelinePermuter`` instance. If the ``data`` parameter is set, the function uses the dataframe passed as argument. Parameters ---------- data : :class:`~pandas.DataFrame`, optional dataframe with performance metrics if custom data should be used or ``None`` to use the attribute of the ``SklearnPipelinePermuter`` instance. Default: ``None`` metrics : list of str, optional list of metrics to include in the table or ``None`` to use all available metrics in the dataframe. Default: ``None`` pipeline_steps : list of str, optional list of pipeline steps to include in the table index or ``None`` to show all available pipeline steps as table index. Default: ``None`` si_table_format : str, optional table format for the ``siunitx`` package or ``None`` to use the default format. Default: ``None`` highlight_best : bool or str, optional Whether to highlight the pipeline with the best value in each column or not. * If ``highlight_best`` is a boolean, the best pipeline is highlighted in each column. 
* If ``highlight_best`` is a string, the best pipeline is highlighted in the column with the name **kwargs additional keyword arguments passed to :func:`~pandas.DataFrame.to_latex` """ kwargs.setdefault("clines", "skip-last;data") kwargs.setdefault("hrules", True) kwargs.setdefault("position", "ht!") kwargs.setdefault("position_float", "centering") kwargs.setdefault("siunitx", True) if si_table_format is None: si_table_format = "table-format = 2.1(2)" if data is None: data = self.metric_summary() metric_summary = data.copy() if pipeline_steps is None: if isinstance(metric_summary.index, pd.MultiIndex): pipeline_steps = list(metric_summary.index.names) else: pipeline_steps = [] if metrics is None: metrics = metric_summary.filter(like="mean_test").columns # extract metric names metrics = [m.split("_")[-1] for m in metrics] levels_to_drop = [step for step in metric_summary.index.names if step not in pipeline_steps] metric_summary = metric_summary.droplevel(levels_to_drop) metric_summary = metric_summary.rename(index=clf_map) list_metric_summary = [] for metric in metrics: list_metric_summary.append(metric_summary.filter(regex=f"(mean|std)_test_{metric}")) metric_summary = pd.concat(list_metric_summary, axis=1) # convert to percent metric_summary = metric_summary * 100 metric_summary_export = metric_summary.copy() for metric in metrics: mean_test = f"mean_test_{metric}" std_test = f"std_test_{metric}" m_sd = metric_summary_export.apply( lambda x, m_t=mean_test, std_t=std_test: rf"{x[m_t]:.1f}({x[std_t]:.1f})", axis=1 ) metric_summary_export = metric_summary_export.assign(**{metric: m_sd}) metric_summary_export = metric_summary_export[metrics].copy() if isinstance(metric_summary_export.index, pd.MultiIndex): metric_summary_export.index = metric_summary_export.index.rename(pipeline_step_map) metric_summary_export = metric_summary_export.rename(columns=metric_map) kwargs.setdefault("column_format", self._format_latex_column_format(metric_summary_export)) styler = styler = 
self._highlight_best(metric_summary, styler, highlight_best, metric_summary_export) metric_summary_tex = styler.to_latex(**kwargs) metric_summary_tex = self._apply_latex_code_correction(metric_summary_tex, si_table_format) return metric_summary_tex
@staticmethod def _format_latex_column_format(data: pd.DataFrame): column_format = "l" * data.index.nlevels if isinstance(data.columns, pd.MultiIndex): ncols = len(data.columns) ncols_last_level = len(data.columns.get_level_values(-1).unique()) column_format += ("S" * ncols_last_level + "|") * (ncols // ncols_last_level) # remove the last "|" column_format = column_format[:-1] else: column_format += "S" * len(data.columns) return column_format @staticmethod def _apply_latex_code_correction(table: str, si_table_format: str) -> str: if si_table_format is not None: table = re.sub(r"(\\begin\{tabular\})", r"\\sisetup{" + si_table_format + r"}\n\n\1", table) return table
[docs] def to_pickle(self, file_path: path_t) -> None: """Export the current instance as a pickle file. Parameters ---------- file_path : :class:`~pathlib.Path` or str file path to export """ file_path = Path(file_path) _assert_file_extension(file_path, ".pkl") with"wb") as f: pickle.dump(self, f)
[docs] @staticmethod def from_pickle(file_path: path_t) -> "SklearnPipelinePermuter": """Import a ``SklearnPipelinePermuter`` instance from a pickle file. Parameters ---------- file_path : :class:`~pathlib.Path` or str file path to import Returns ------- :class:`~biopsykit.classification.model_selection.SklearnPipelinePermuter` ``SklearnPipelinePermuter` instance """ file_path = Path(file_path) _assert_file_extension(file_path, ".pkl") with"rb") as f: return pickle.load(f)
[docs] def update_permuter( self, model_dict: Optional[Dict[str, Dict[str, BaseEstimator]]] = None, param_dict: Optional[Dict[str, Any]] = None, hyper_search_dict: Optional[Dict[str, Dict[str, Any]]] = None, ) -> Self: """Update the ``SklearnPipelinePermuter`` instance with new model and parameter dictionaries. Parameters ---------- model_dict : dict, optional dictionary with model classes for each pipeline step param_dict : dict, optional dictionary with parameter grids for each pipeline step hyper_search_dict : dict, optional dictionary with hyperparameter search settings for each estimator Returns ------- :class:`~biopsykit.classification.model_selection.SklearnPipelinePermuter` updated ``SklearnPipelinePermuter`` instance """ permuter = SklearnPipelinePermuter(model_dict, param_dict, hyper_search_dict) return SklearnPipelinePermuter._merge_permuter_params(self, permuter)
@classmethod
def _merge_permuter_params(cls, permuter_01: Self, permuter_02: Self):
    """Merge the settings of ``permuter_02`` into ``permuter_01``.

    The ``models``, ``hyper_search_dict``, ``param_searches``, and ``params`` attributes are combined
    with ``merge_nested_dicts``; the ``model_combinations`` of both instances are concatenated and
    de-duplicated. ``permuter_01`` is modified in place.

    Parameters
    ----------
    permuter_01 : :class:`~biopsykit.classification.model_selection.SklearnPipelinePermuter`
        instance that receives the merged settings
    permuter_02 : :class:`~biopsykit.classification.model_selection.SklearnPipelinePermuter`
        instance whose settings are merged into ``permuter_01``

    Returns
    -------
    :class:`~biopsykit.classification.model_selection.SklearnPipelinePermuter`
        ``permuter_01`` with the merged settings

    """
    # merge model dicts
    permuter_01.models = merge_nested_dicts(permuter_01.models, permuter_02.models)
    # merge hyperparameter search dicts
    permuter_01.hyper_search_dict = merge_nested_dicts(permuter_01.hyper_search_dict, permuter_02.hyper_search_dict)
    # merge hyperparameter dicts
    permuter_01.param_searches = merge_nested_dicts(permuter_01.param_searches, permuter_02.param_searches)
    permuter_01.params = merge_nested_dicts(permuter_01.params, permuter_02.params)

    # merge model combinations; ``dict.fromkeys`` drops duplicates while keeping first-seen order,
    # whereas a ``set`` would yield an arbitrary order that can differ between interpreter runs
    combined = [*permuter_01.model_combinations, *permuter_02.model_combinations]
    permuter_01.model_combinations = list(dict.fromkeys(combined))

    return permuter_01
@classmethod
def merge_permuter_instances(cls, permuter: Union[Sequence[Self], Sequence[path_t]]) -> Self:
    """Merge two (or more) :class:`~biopsykit.classification.model_selection.SklearnPipelinePermuter` instances.

    This function expects at least two ``SklearnPipelinePermuter`` instances to merge. The function first
    performs a deep copy of the first instance and then merges the settings and results of the remaining
    ``permuter`` instances into the copy, so the first instance is not modified.

    Entries given as file paths are loaded with
    :meth:`~biopsykit.classification.model_selection.SklearnPipelinePermuter.from_pickle` first; instances
    and file paths may be mixed. Only pass pickle files from trusted sources.

    Parameters
    ----------
    permuter : list of :class:`~biopsykit.classification.model_selection.SklearnPipelinePermuter` instances or
        list of file paths to pickled ``SklearnPipelinePermuter`` instances

    Returns
    -------
    :class:`~biopsykit.classification.model_selection.SklearnPipelinePermuter`
        merged ``SklearnPipelinePermuter`` instance

    Raises
    ------
    ValueError
        if fewer than two instances are passed, or if the instances differ in their ``scoring`` or
        ``refit`` settings

    """
    # ensure that permuter contains at least two instances
    if len(permuter) < 2:
        raise ValueError("At least two SklearnPipelinePermuter instances must be passed to this function.")

    # load each entry that is a file path; entries that already are instances are used as they are
    permuter = [cls.from_pickle(p) if isinstance(p, (str, Path)) else p for p in permuter]

    # make deep copy of first instance
    base_permuter = deepcopy(permuter[0])
    for p in permuter[1:]:
        if base_permuter.scoring != p.scoring:
            raise ValueError(
                f"Cannot merge permuter instances with different scoring functions: "
                f"{base_permuter.scoring} vs. {p.scoring}"
            )
        if base_permuter.refit != p.refit:
            raise ValueError(
                f"Cannot merge permuter instances with different refit options: "
                f"{base_permuter.refit} vs. {p.refit}"
            )
        SklearnPipelinePermuter._merge_permuter_params(base_permuter, p)

        # merge results dataframes
        results_concat = pd.concat([base_permuter.results, p.results], axis=0)
        param_cols = list(results_concat.filter(like="param_").columns)
        # drop duplicate parameter combinations in results; "outer_fold" is moved out of the index
        # temporarily so that it can be part of the duplicate check
        results_concat = results_concat.reset_index("outer_fold").drop_duplicates(
            subset=["outer_fold", *param_cols]
        )
        results_concat = results_concat.set_index("outer_fold", append=True)
        base_permuter.results = results_concat

    return base_permuter
@staticmethod
def _apply_score(row: pd.Series, score_func, pos_label: str):
    """Apply a scoring function to the true and predicted labels of every fold in one row.

    Parameters
    ----------
    row : :class:`~pandas.Series`
        series whose first element holds the true labels per fold and whose second element holds the
        predicted labels per fold
    score_func : callable
        scoring function called as ``score_func(true_labels, predicted_labels, **kwargs)``
    pos_label : str
        positive label; only passed on if ``score_func`` has a ``pos_label`` parameter

    Returns
    -------
    :class:`~pandas.Series`
        one score per fold

    """
    # access by position explicitly: ``row[0]`` on a label-indexed Series is a deprecated positional lookup
    true_labels_folds = row.iloc[0]
    predicted_labels_folds = row.iloc[1]

    # only pass the optional arguments that the scoring function actually accepts
    params = signature(score_func).parameters
    kwargs = {}
    if "pos_label" in params:
        kwargs["pos_label"] = pos_label
    if "zero_division" in params:
        kwargs["zero_division"] = 0

    scores = [
        score_func(true_labels, predicted_labels, **kwargs)
        for true_labels, predicted_labels in zip(true_labels_folds, predicted_labels_folds)
    ]
    return pd.Series(scores)
def compute_additional_metrics(self, metric_summary: pd.DataFrame, metrics: str_t, pos_label: str) -> pd.DataFrame:
    """Compute additional classification metrics.

    The metrics are looked up by name in :mod:`sklearn.metrics` and computed per fold from the
    ``true_labels_folds`` and ``predicted_labels_folds`` columns of ``metric_summary``. Mean and standard
    deviation over the folds are appended as ``mean_test_<metric>`` and ``std_test_<metric>`` columns.

    Parameters
    ----------
    metric_summary : :class:`~pandas.DataFrame`
        metric summary from :meth:`~biopsykit.classification.model_selection.SklearnPipelinePermuter.metric_summary`
    metrics : str or list of str
        metric(s) to compute. Names may be given with or without the ``_score`` suffix of the
        corresponding function in :mod:`sklearn.metrics` (e.g., ``"f1"`` or ``"f1_score"``).
    pos_label : str
        positive label for binary classification

    Returns
    -------
    :class:`~pandas.DataFrame`
        metric summary with additional metrics computed

    Raises
    ------
    ValueError
        if a metric has no matching ``<metric>_score`` member in :mod:`sklearn.metrics`

    """
    if isinstance(metrics, str):
        metrics = [metrics]

    metric_slice = metric_summary[["true_labels_folds", "predicted_labels_folds"]].copy()

    metric_out = {}
    score_funcs = dict(getmembers(sklearn.metrics))
    for metric in metrics:
        if metric.endswith("_score"):
            score_name = metric
            # strip '_score' suffix from metric name for column name
            metric = metric.replace("_score", "")  # noqa: PLW2901
        else:
            # name for calling sklearn metric function
            score_name = metric + "_score"
        if score_name not in score_funcs:
            raise ValueError(f"Metric '{metric}' not found.")
        score_func = score_funcs[score_name]
        metric_out[metric] = metric_slice.apply(self._apply_score, args=(score_func, pos_label), axis=1)

    # columns become a (score, folds) MultiIndex; move both levels into the row index
    metric_out = pd.concat(metric_out, names=["score", "folds"], axis=1)
    metric_out = metric_out.stack(["score", "folds"])
    # aggregate over the folds, i.e., group by every index level except the last one ("folds");
    # the lambdas call NumPy directly, so the standard deviation uses NumPy's default (ddof=0)
    metric_out = metric_out.groupby(metric_out.index.names[:-1]).agg(
        [("mean", lambda x: np.mean(x)), ("std", lambda x: np.std(x))]
    )
    metric_out = metric_out.unstack("score").sort_index(axis=1, level="score")
    # flatten the (statistic, score) column tuples into names such as "mean_test_f1"
    metric_out.columns = metric_out.columns.map("_test_".join)

    metric_summary = metric_summary.join(metric_out)
    # resort columns so that all "mean_test_*" and "std_test_*" columns are at the end
    cols = list(metric_summary.filter(regex="^(?!mean_test_|std_test_).*$").columns)
    cols += list(metric_summary.filter(regex="^(mean_test_|std_test_).*$").columns)
    metric_summary = metric_summary[cols]

    return metric_summary
@staticmethod
def _highlight_best(metric_summary, styler, highlight_best, metric_summary_export):
    """Mark the best result(s) of a styled metric table in bold.

    Parameters
    ----------
    metric_summary : :class:`~pandas.DataFrame`
        metric summary containing the ``mean_test_<metric>`` columns
    styler : styler object
        styler of the exported table; must provide ``highlight_max`` and ``apply_index``
    highlight_best : str or bool
        if a metric name (str), the maximum of that metric's column and the index entry of the
        corresponding row are set in bold. If ``True``, ``highlight_max`` is applied without a subset.
        For any other value the styler is returned unchanged.
    metric_summary_export : :class:`~pandas.DataFrame`
        exported table; used to look up the row position of the best result

    Returns
    -------
    styler object
        styler with highlighting applied

    """
    bold = "bfseries: ;"

    if isinstance(highlight_best, str):
        # label of the row with the best score, translated to its position in the exported table
        best_label = metric_summary[f"mean_test_{highlight_best}"].idxmax()
        best_position = metric_summary_export.index.get_loc(best_label)

        styler = styler.highlight_max(subset=metric_map[highlight_best], props=bold)

        def _bold_best_index(labels):
            # make the index entry at the position of the best row bold
            return np.where(labels.index == best_position, "bfseries: ;", "")

        return styler.apply_index(_bold_best_index)

    if highlight_best is True:
        return styler.highlight_max(props=bold)

    return styler