Source code for biopsykit.stats.regression

"""Functions for performing regression analysis."""
from typing import Union

import pandas as pd
import pingouin as pg

__all__ = ["stepwise_backwards_linear_regression"]


def stepwise_backwards_linear_regression(
    predictors: pd.DataFrame, dv: Union[pd.DataFrame, pd.Series]
) -> pd.DataFrame:
    """Perform a stepwise backwards linear regression.

    The stepwise backwards linear regression is performed iteratively by running a linear regression on the
    predictors and removing the predictor with the highest p-value. This is repeated until no predictor is left.
    The best-fitting model is obtained by choosing the regression model with the highest adjusted R-squared value.
    If several models share the highest adjusted R-squared value, the one fitted first (i.e., the one with the
    most predictors) is returned.

    Parameters
    ----------
    predictors : :class:`pandas.DataFrame`
        Dataframe with predictors for the regression. The passed dataframe is not modified.
    dv : :class:`pandas.DataFrame` or :class:`pandas.Series`
        Dependent variable for the regression. A dataframe is only accepted if it has exactly one column.

    Returns
    -------
    :class:`pandas.DataFrame`
        Best-fitted regression model.

    Raises
    ------
    ValueError
        If ``dv`` has more than one column or if ``predictors`` does not contain any column.

    """
    # dv must only be a single column
    if dv.ndim > 1:
        if dv.shape[-1] != 1:
            raise ValueError("dv must be a single column")
        dv = dv.iloc[:, 0]

    # without any predictor no model can be fitted; fail with a clear message instead of the
    # opaque "max() arg is an empty sequence" error that the model selection below would produce
    if len(predictors.columns) == 0:
        raise ValueError("predictors must contain at least one column")

    list_adj_r2 = []
    list_reg_models = []
    while len(predictors.columns) > 0:
        reg_results = _lin_reg(predictors, dv)
        # drop the least significant predictor, i.e. the predictor with the highest p-value
        # (rebinding the local name leaves the caller's dataframe untouched)
        drop_col = reg_results["pval"].idxmax()
        predictors = predictors.drop(columns=drop_col)
        # the adjusted R2 of the current model is read from the first row of the result table
        adj_r2 = reg_results["adj_r2"].iloc[0]
        list_adj_r2.append(adj_r2)
        list_reg_models.append(reg_results)

    # get the best regression model, i.e. the model with the highest adjusted R2
    return list_reg_models[list_adj_r2.index(max(list_adj_r2))]


def _lin_reg(predictors: pd.DataFrame, dv: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame:
    """Run a single linear regression and return its result table without the intercept.

    Parameters
    ----------
    predictors : :class:`pandas.DataFrame`
        Dataframe with predictors for the regression.
    dv : :class:`pandas.DataFrame` or :class:`pandas.Series`
        Dependent variable for the regression.

    Returns
    -------
    :class:`pandas.DataFrame`
        Result of :func:`pingouin.linear_regression`, indexed by the ``names`` column and
        with the ``Intercept`` row removed.

    """
    regression_table = pg.linear_regression(predictors, dv)
    # index by term name so that the intercept row can be addressed (and removed) by label
    table_by_term = regression_table.set_index("names")
    return table_by_term.drop(index="Intercept")