# Source code for biopsykit.stats.regression

"""Functions for performing regression analysis."""
from typing import Union

import pandas as pd
import pingouin as pg

__all__ = ["stepwise_backwards_linear_regression"]


def stepwise_backwards_linear_regression(predictors: pd.DataFrame, dv: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame:
    """Perform a stepwise backwards linear regression.

    The stepwise backwards linear regression is performed iteratively: a linear regression is fitted on the current
    set of predictors, the result is stored, and the predictor with the highest p-value is removed. This is repeated
    until no predictor is left. The best-fitting model is the stored regression result with the highest adjusted
    R-squared value; if several models share that value, the one fitted first (i.e., with the most predictors) wins.

    Parameters
    ----------
    predictors : :class:`pandas.DataFrame`
        Dataframe with predictors for the regression, one predictor per column. The passed dataframe is not modified.
    dv : :class:`pandas.DataFrame` or :class:`pandas.Series`
        Dependent variable for the regression. A dataframe is accepted if it has exactly one column.

    Returns
    -------
    :class:`pandas.DataFrame`
        Best-fitted regression model, as returned by ``_lin_reg`` (indexed by predictor name, without the
        intercept row).

    Raises
    ------
    ValueError
        If ``dv`` has more than one column or if ``predictors`` has no columns.

    """
    # dv must be one-dimensional: a single-column dataframe is reduced to its only column
    if dv.ndim > 1:
        if dv.shape[-1] != 1:
            raise ValueError("dv must be a single column")
        dv = dv.iloc[:, 0]

    # without predictors no model can be fitted; fail with a clear message instead of an opaque max() error
    if len(predictors.columns) == 0:
        raise ValueError("predictors must contain at least one column")

    reg_models = []
    while len(predictors.columns) > 0:
        reg_results = _lin_reg(predictors, dv)
        reg_models.append(reg_results)
        # backward elimination: remove the least significant predictor, i.e. the one with the highest p-value
        predictors = predictors.drop(columns=reg_results["pval"].idxmax())

    # the adjusted R2 is read from the first row of each result table;
    # max() returns the first maximal element, so ties resolve to the earliest (largest) model
    return max(reg_models, key=lambda model: model["adj_r2"].iloc[0])


def _lin_reg(predictors: pd.DataFrame, dv: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame:
    """Fit a linear regression of ``dv`` on ``predictors`` using :func:`pingouin.linear_regression`.

    Parameters
    ----------
    predictors : :class:`pandas.DataFrame`
        Dataframe with predictors for the regression.
    dv : :class:`pandas.DataFrame` or :class:`pandas.Series`
        Dependent variable for the regression.

    Returns
    -------
    :class:`pandas.DataFrame`
        Regression result table indexed by the ``"names"`` column, with the ``"Intercept"`` row removed.

    """
    result_table = pg.linear_regression(predictors, dv)
    by_name = result_table.set_index("names")
    # only the predictor rows are kept; the intercept row is discarded
    return by_name.drop(index="Intercept")