Source code for biopsykit.classification.analysis._analysis

"""Functions to analyze classification results."""
from typing import Any, Dict, Optional, Sequence, Tuple, Union

import numpy as np
import pandas as pd
import seaborn as sns
from biopsykit.classification.model_selection import SklearnPipelinePermuter
from biopsykit.classification.utils import prepare_df_sklearn
from fau_colors import cmaps
from matplotlib import pyplot as plt
from matplotlib.cm import register_cmap
from matplotlib.colors import ListedColormap
from sklearn.metrics import ConfusionMatrixDisplay

# Display names for the pipeline step levels (values are LaTeX-ready strings).
pipeline_step_map = dict(
    [
        ("pipeline_scaler", "Scaler"),
        ("pipeline_reduce_dim", r"\makecell[lc]{Feature\\ Selection}"),
        ("pipeline_clf", "Classifier"),
    ]
)

# LaTeX column headers for the performance metrics. Every header has the same
# ``\makecell{<title> [\%]}`` shape, so only the title differs per metric.
metric_map = {
    metric: r"\makecell{" + title + r" [\%]}"
    for metric, title in (
        ("accuracy", "Accuracy"),
        ("f1", "F1-score"),
        ("precision", "Precision"),
        ("recall", "Recall"),
        ("auc", "AUC"),
        ("sensitivity", "Sensitivity"),
        ("specificity", "Specificity"),
    )
}

# Short display names for scaler, feature-selection and classifier class names.
clf_map = dict(
    [
        ("MinMaxScaler", "Min-Max"),
        ("StandardScaler", "Standard"),
        ("SelectKBest", "SkB"),
        ("RFE", "RFE"),
        ("GaussianNB", "NB"),
        ("KNeighborsClassifier", "kNN"),
        ("DecisionTreeClassifier", "DT"),
        ("SVC", "SVM"),
        ("RandomForestClassifier", "RF"),
        ("MLPClassifier", "MLP"),
        ("AdaBoostClassifier", "Ada"),
    ]
)


def predictions_as_df(
    pipeline_permuter: SklearnPipelinePermuter,
    data: pd.DataFrame,
    pipeline: Tuple[str],
    label_mapping: Optional[Dict[str, str]] = None,
    index_col: Optional[str] = None,
) -> pd.DataFrame:
    """Return the true and predicted labels of one pipeline, indexed like the input data.

    Parameters
    ----------
    pipeline_permuter : :class:`~biopsykit.classification.model_selection.SklearnPipelinePermuter`
        permuter instance whose ``metric_summary()`` provides the ``"true_labels"`` and
        ``"predicted_labels"`` columns
    data : :class:`~pandas.DataFrame`
        input data; only its index is used
    pipeline : tuple
        key of the pipeline combination to select from the metric summary
    label_mapping : dict, optional
        mapping passed to :meth:`~pandas.DataFrame.replace` to rename labels in the output,
        or ``None`` (or an empty dict) to keep the original labels. Default: ``None``
    index_col : str, optional
        name of the index level to use as output index if ``data`` has a multi-index.
        If ``None``, the first index level is used. Ignored for a flat index.
        Default: ``None``

    Returns
    -------
    :class:`~pandas.DataFrame`
        dataframe with the columns ``"true_labels"`` and ``"predicted_labels"``

    """
    columns = ["true_labels", "predicted_labels"]
    summary = pipeline_permuter.metric_summary()
    # one row per sample: the list-valued label columns are exploded in parallel
    result = summary[columns].explode(columns).loc[pipeline]

    if not isinstance(data.index, pd.MultiIndex):
        new_index = data.index
    else:
        level = data.index.names[0] if index_col is None else index_col
        new_index = data.index.get_level_values(level)

    # NOTE(review): the index is assigned positionally, which presumably requires the
    # exploded predictions to be in the same row order as ``data`` -- confirm against
    # how the permuter concatenates predictions across folds.
    result.index = new_index

    return result.replace(label_mapping) if label_mapping else result
def predict_proba_from_estimator(
    pipeline_permuter: SklearnPipelinePermuter,
    data: pd.DataFrame,
    pipeline: Tuple[str],
    label_col: Optional[str] = "label",
    column_names: Optional[Dict[str, str]] = None,
) -> pd.DataFrame:
    """Get predictions as probabilities from a specified pipeline, indexed like the input dataframe.

    Parameters
    ----------
    pipeline_permuter : :class:`~biopsykit.classification.model_selection.SklearnPipelinePermuter`
        :class:`~biopsykit.classification.model_selection.SklearnPipelinePermuter` instance
    data : :class:`~pandas.DataFrame`
        input data
    pipeline : tuple
        pipeline to get predictions from
    label_col : str, optional
        name of the label column in the input dataframe. Default: ``"label"``
    column_names : dict, optional
        mapping of column names to rename columns in the output dataframe or ``None`` to keep original
        column names. Default: ``None``

    Returns
    -------
    :class:`~pandas.DataFrame`
        dataframe with predictions as probabilities (one column per class, rounded to 4 decimals),
        sorted by index

    """
    # Look up the summary row of the requested pipeline once and reuse it.
    pipeline_metrics = pipeline_permuter.metric_summary().loc[pipeline]
    test_indices_folds = pipeline_metrics["test_indices_folds"]
    test_indices_flat = list(pipeline_metrics["test_indices"])

    # Indexed by fold number below -- presumably one fitted estimator per outer fold (TODO confirm).
    fold_estimators = pipeline_permuter.best_estimator_summary().loc[pipeline].iloc[0].pipeline

    # Only the feature matrix is needed; the remaining return values are unused here.
    x, _, _, _ = prepare_df_sklearn(data, label_col=label_col, print_summary=False)

    # Class order of the first fold's estimator is used as column order for all folds.
    label_order = fold_estimators[0].classes_

    proba_per_fold = [
        fold_estimators[fold].predict_proba(x[list(test_idx)])
        for fold, test_idx in enumerate(test_indices_folds)
    ]

    results_proba = pd.DataFrame(
        np.concatenate(proba_per_fold),
        columns=label_order,
        index=data.index[test_indices_flat],
    )
    if column_names is not None:
        results_proba = results_proba.rename(columns=column_names)

    return results_proba.sort_index().round(4)
def _conf_matrix_from_proba_df(
    data: pd.DataFrame, label_col: Optional[str] = "label", label_order: Optional[Sequence[str]] = None
) -> pd.DataFrame:
    """Get confusion matrix from a dataframe with predictions as probabilities.

    The matrix holds, for every true label (rows), the mean of each probability column (columns).

    Parameters
    ----------
    data : :class:`~pandas.DataFrame`
        dataframe with predictions as probabilities. The true labels are taken from ``label_col``,
        which may be a regular column or an index level
    label_col : str, optional
        name of the label column in the input dataframe. Default: ``"label"``
    label_order : list, optional
        order of labels in the confusion matrix or ``None`` to use the order of the probability columns
        in the input dataframe. Default: ``None``

    Returns
    -------
    :class:`~pandas.DataFrame`
        confusion matrix as dataframe

    """
    if label_order is None:
        # ``label_col`` is the grouping key, not a class; leaving it in the default order
        # would add an all-NaN row and column to the matrix when it is a regular column.
        label_order = [col for col in data.columns if col != label_col]

    conf_matrix_proba = data.groupby(label_col).mean()
    return conf_matrix_proba.reindex(label_order, axis=0).reindex(label_order, axis=1)


def _register_fau_r():
    """Register the reversed ``fau`` palette as Matplotlib colormap ``"fau_r"`` if it is not registered yet."""
    if "fau_r" in plt.colormaps():
        return

    palette = sns.color_palette(cmaps.fau)
    palette.reverse()
    # NOTE(review): ``matplotlib.cm.register_cmap`` is deprecated in newer Matplotlib releases in
    # favour of ``matplotlib.colormaps.register`` -- confirm the supported Matplotlib version range.
    register_cmap("fau_r", ListedColormap(palette, "fau_r"))
def plot_conf_matrix(
    predictions: pd.DataFrame,
    labels: Optional[Union[Sequence[str], Dict[str, str]]] = None,
    label_name: Optional[str] = "label",
    conf_matrix_kwargs: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> Tuple[plt.Figure, plt.Axes]:
    """Plot confusion matrix from predictions.

    Parameters
    ----------
    predictions : :class:`~pandas.DataFrame`
        dataframe with predictions. Must contain the columns ``"true_labels"`` and ``"predicted_labels"``
    labels : list, dict, optional
        list of labels to use in the confusion matrix or dictionary with label names in the data frame as key
        and the corresponding label names to use in the confusion matrix as value.
        Default: ``None`` to let :meth:`~sklearn.metrics.ConfusionMatrixDisplay.from_predictions`
        determine the labels
    label_name : str, optional
        name of the 'label' in the axis titles. Default: "label" to yield "True label" and "Predicted label"
    conf_matrix_kwargs : dict, optional
        additional keyword arguments to pass to
        :meth:`~sklearn.metrics.ConfusionMatrixDisplay.from_predictions`. If no ``cmap`` is given, the
        ``"fau_r"`` colormap is used; ``colorbar`` defaults to ``False``. The passed dict is not modified
    **kwargs
        additional keyword arguments to pass to :func:`~matplotlib.pyplot.subplots`, except for:

        * ``ax``: existing axes to plot on. If given, no new figure is created
        * ``despine``: whether to hide the axes spines. Default: ``True``

    Returns
    -------
    fig : :class:`~matplotlib.figure.Figure`
        figure object
    ax : :class:`~matplotlib.axes.Axes`
        axes object

    """
    # Work on a copy so that the defaults set below do not leak into the caller's dict.
    conf_matrix_kwargs = dict(conf_matrix_kwargs) if conf_matrix_kwargs else {}

    # These options are consumed here and must not be forwarded to ``plt.subplots``.
    ax = kwargs.pop("ax", None)
    despine = kwargs.pop("despine", True)

    if ax is None:
        fig, ax = plt.subplots(**kwargs)
    else:
        fig = ax.get_figure()

    predictions = predictions.copy()

    if isinstance(labels, dict):
        # only replace in true_labels and predicted_labels columns
        label_cols = ["true_labels", "predicted_labels"]
        predictions[label_cols] = predictions[label_cols].replace(labels)
        labels = list(labels.values())

    if not conf_matrix_kwargs.get("cmap"):
        # check if fau_r is registered as colormap
        if "fau_r" not in plt.colormaps():
            _register_fau_r()
        conf_matrix_kwargs["cmap"] = "fau_r"

    conf_matrix_kwargs.setdefault("colorbar", False)

    ConfusionMatrixDisplay.from_predictions(
        predictions["true_labels"],
        predictions["predicted_labels"],
        labels=labels,
        ax=ax,
        **conf_matrix_kwargs,
    )

    if despine:
        for spine in ("top", "bottom", "left", "right"):
            ax.spines[spine].set_color("None")

    ax.set_ylabel(f"True {label_name}")
    ax.set_xlabel(f"Predicted {label_name}")

    return fig, ax
def plot_conf_matrix_proba(
    predictions: pd.DataFrame,
    labels: Sequence[str],
    label_col: Optional[str] = "label",
    label_name: Optional[str] = "label",
    **kwargs,
) -> Tuple[plt.Figure, plt.Axes]:
    """Plot confusion matrix from prediction probabilities.

    The plotted matrix contains the mean predicted probability of each class (columns) per true
    label (rows), drawn as an annotated heatmap.

    Parameters
    ----------
    predictions : :class:`~pandas.DataFrame`
        dataframe with predictions as probabilities
    labels : list
        list of labels, defining the order of rows and columns in the confusion matrix
    label_col : str, optional
        name of the label column in the input dataframe. Default: ``"label"``
    label_name : str, optional
        name of the 'label' in the axis titles. Default: "label" to yield "True label" and "Predicted label"
    **kwargs
        additional keyword arguments to pass to :func:`~matplotlib.pyplot.subplots`, except for
        ``ax``: existing axes to plot on. If given, no new figure is created

    Returns
    -------
    fig : :class:`~matplotlib.figure.Figure`
        figure object
    ax : :class:`~matplotlib.axes.Axes`
        axes object

    """
    # ``ax`` is consumed here; forwarding it (even as ``None``) to ``plt.subplots`` would fail.
    ax = kwargs.pop("ax", None)
    if ax is None:
        fig, ax = plt.subplots(**kwargs)
    else:
        fig = ax.get_figure()

    # check if fau_r is registered as colormap
    if "fau_r" not in plt.colormaps():
        _register_fau_r()

    conf_matrix_proba = _conf_matrix_from_proba_df(predictions, label_col=label_col, label_order=labels)

    sns.heatmap(conf_matrix_proba, cmap="fau_r", annot=True, cbar=False, square=True, ax=ax)
    ax.set_ylabel(f"True {label_name}")
    ax.set_xlabel(f"Predicted {label_name}")

    return fig, ax
def metric_summary_to_latex(
    permuter_or_df: Union[SklearnPipelinePermuter, pd.DataFrame],
    metrics: Optional[Sequence[str]] = None,
    pipeline_steps: Optional[Sequence[str]] = None,
    si_table_format: Optional[str] = None,
    highlight_best: Optional[Union[bool, str]] = None,
    **kwargs,
) -> str:
    """Return a latex table with the performance metrics of the pipeline combinations.

    Notes
    -----
    This method is a legacy method that is kept for backwards compatibility with older pickled instances of the
    :class:`SklearnPipelinePermuter` class. It is recommended to use the
    :meth:`SklearnPipelinePermuter.metric_summary_to_latex` method instead, to which this function delegates.

    See Also
    --------
    :meth:`SklearnPipelinePermuter.metric_summary_to_latex`

    Parameters
    ----------
    permuter_or_df : :class:`SklearnPipelinePermuter` or :class:`~pandas.DataFrame`
        :class:`SklearnPipelinePermuter` instance or dataframe with performance metrics.
    metrics : list of str, optional
        list of metrics to include in the table or ``None`` to use all available metrics in the dataframe.
        Default: ``None``
    pipeline_steps : list of str, optional
        list of pipeline steps to include in the table index or ``None`` to show all available pipeline steps
        as table index. Default: ``None``
    si_table_format : str, optional
        table format for the ``siunitx`` package or ``None`` to use the default format. Default: ``None``
    highlight_best : bool or str, optional
        Whether to highlight the pipeline with the best value in each column or not.

        * If ``highlight_best`` is a boolean, the best pipeline is highlighted in each column.
        * If ``highlight_best`` is a string, the best pipeline is highlighted in the column with that name.
    **kwargs
        additional keyword arguments passed to :func:`~pandas.DataFrame.to_latex`

    Returns
    -------
    str
        latex table as returned by :meth:`SklearnPipelinePermuter.metric_summary_to_latex`

    """
    if isinstance(permuter_or_df, SklearnPipelinePermuter):
        # ``metric_summary`` is a method: call it to obtain the dataframe instead of
        # handing the bound method itself to the table formatter.
        metric_summary = permuter_or_df.metric_summary()
    else:
        metric_summary = permuter_or_df

    # An empty permuter is only used as a vehicle to reach the formatting method.
    dummy_permuter = SklearnPipelinePermuter()
    return dummy_permuter.metric_summary_to_latex(
        metric_summary,
        metrics=metrics,
        pipeline_steps=pipeline_steps,
        si_table_format=si_table_format,
        highlight_best=highlight_best,
        **kwargs,
    )