Source code for biopsykit.io.saliva

"""Module wrapping biopsykit.io.biomarker including only I/O functions for saliva data."""
from pathlib import Path
from typing import Dict, Optional, Sequence, Tuple, Union

import pandas as pd
from biopsykit.io import biomarker

__all__ = ["load_saliva_plate", "save_saliva", "load_saliva_wide_format"]

from biopsykit.utils._types import path_t
from biopsykit.utils.datatype_helper import SalivaRawDataFrame, SubjectConditionDataFrame


[docs]def load_saliva_plate(
    file_path: path_t,
    saliva_type: str,
    sample_id_col: Optional[str] = None,
    data_col: Optional[str] = None,
    id_col_names: Optional[Sequence[str]] = None,
    regex_str: Optional[str] = None,
    sample_times: Optional[Sequence[int]] = None,
    condition_list: Optional[Union[Sequence, Dict[str, Sequence], pd.Index]] = None,
    **kwargs,
) -> SalivaRawDataFrame:
    r"""Read saliva from an Excel sheet in 'plate' format.

    This function automatically extracts identifier like subject, day and sample IDs from the saliva sample names.
    To extract them, a regular expression string can be passed via ``regex_str``.

    Here are some examples on how sample identifiers might look like and what the corresponding
    ``regex_str`` would output:

        * "Vp01 S1"
          => ``r"(Vp\d+) (S\d)"`` (this is the default pattern, you can also just set ``regex_str`` to ``None``)
          => data ``[Vp01, S1]`` in two columns: ``subject``, ``sample``
          (unless column names are explicitly specified in ``data_col_names``)
        * "Vp01 T1 S1" ... "Vp01 T1 S5" (only *numeric* characters in day/sample)
          => ``r"(Vp\d+) (T\d) (S\d)"``
          => three columns: ``subject``, ``sample`` with data ``[Vp01, T1, S1]``
          (unless column names are explicitly specified in ``data_col_names``)
        * "Vp01 T1 S1" ... "Vp01 T1 SA" (also *letter* characters in day/sample)
          => ``r"(Vp\d+) (T\w) (S\w)"``
          => three columns: ``subject``, ``sample`` with data ``[Vp01, T1, S1]``
          (unless column names are explicitly specified in ``data_col_names``)

    If you **don't** want to extract the 'S' or 'T' prefixes in saliva or day IDs, respectively,
    you have to move it **out** of the capture group in the ``regex_str`` (round brackets), like this:
    ``(S\d)`` (would give ``S1``, ``S2``, ...)
    => ``S(\d)`` (would give ``1``, ``2``, ...)


    Parameters
    ----------
    file_path: :class:`~pathlib.Path` or str
        path to the Excel sheet in 'plate' format containing saliva data
    saliva_type: str
        saliva type to load from file
    sample_id_col: str, optional
        column name of the Excel sheet containing the sample ID. Default: "sample ID"
    data_col: str, optional
        column name of the Excel sheet containing saliva data to be analyzed.
        Default: Select default column name based on ``biomarker_type``, e.g. ``cortisol`` => ``cortisol (nmol/l)``
    id_col_names: list of str, optional
        names of the extracted ID column names. ``None`` to use the default column names (['subject', 'day', 'sample'])
    regex_str: str, optional
        regular expression to extract subject ID, day ID and sample ID from the sample identifier.
        ``None`` to use default regex string (``r"(Vp\d+) (S\d)"``)
    sample_times: list of int, optional
        times at which saliva samples were collected
    condition_list: 1d-array, optional
        list of conditions which subjects were assigned to
    **kwargs
        Additional parameters that are passed to :func:`pandas.read_excel`

    Returns
    -------
    data : :class:`~biopsykit.utils.datatype_helper.SalivaRawDataFrame`
        saliva data in `SalivaRawDataFrame` format

    Raises
    ------
    :exc:`~biopsykit.utils.exceptions.FileExtensionError`
        if file is no Excel file (.xls or .xlsx)
    ValueError
        if any saliva sample can not be converted into a float (e.g. because there was text in one of the columns)
    :exc:`~biopsykit.utils.exceptions.ValidationError`
        if imported data can not be parsed to a SalivaRawDataFrame

    """
    return biomarker.load_saliva_plate(
        file_path, saliva_type, sample_id_col, data_col, id_col_names, regex_str, sample_times, condition_list, **kwargs
    )


[docs]def save_saliva(
    file_path: path_t,
    data: SalivaRawDataFrame,
    saliva_type: Optional[str] = "cortisol",
    as_wide_format: Optional[bool] = False,
):
    """Save saliva data to csv file.

    Parameters
    ----------
    file_path: :class:`~pathlib.Path` or str
        file path to export. Must be a csv or an Excel file
    data : :class:`~biopsykit.utils.datatype_helper.SalivaRawDataFrame`
        saliva data in `SalivaRawDataFrame` format
    saliva_type : str
        type of saliva data in the dataframe
    as_wide_format : bool, optional
        ``True`` to save data in wide format (and flatten all index levels), ``False`` to save data in long-format.
        Default: ``False``

    Raises
    ------
    :exc:`~biopsykit.utils.exceptions.ValidationError`
        if ``data`` is not a SalivaRawDataFrame
    :exc:`~biopsykit.utils.exceptions.FileExtensionError`
        if ``file_path`` is not a csv or Excel file

    """
    biomarker.save_saliva(file_path, data, saliva_type, as_wide_format)


[docs]def load_saliva_wide_format(
    file_path: path_t,
    saliva_type: str,
    subject_col: Optional[str] = None,
    condition_col: Optional[str] = None,
    additional_index_cols: Optional[Union[str, Sequence[str]]] = None,
    sample_times: Optional[Sequence[int]] = None,
    **kwargs,
) -> SalivaRawDataFrame:
    """Load saliva data that is in wide-format from csv file.

    It will return a `SalivaRawDataFrame`, a long-format dataframe that complies with BioPsyKit's naming convention,
    i.e., the subject ID index will be named ``subject``, the sample index will be names ``sample``,
    and the value column will be named after the saliva biomarker type.

    Parameters
    ----------
    file_path: :class:`~pathlib.Path` or str
        path to file
    saliva_type: str
        saliva type to load from file. Example: ``cortisol``
    subject_col: str, optional
        name of column containing subject IDs or ``None`` to use the default column name ``subject``.
        According to BioPsyKit's convention, the subject ID column is expected to have the name ``subject``.
        If the subject ID column in the file has another name, the column will be renamed in the dataframe
        returned by this function. Default: ``None``
    condition_col : str, optional
        name of the column containing condition assignments or ``None`` if no conditions are present.
        According to BioPsyKit's convention, the condition column is expected to have the name ``condition``.
        If the condition column in the file has another name, the column will be renamed in the dataframe
        returned by this function. Default: ``None``
    additional_index_cols : str or list of str, optional
        additional index levels to be added to the dataframe, e.g., "day" index. Can either be a string or a list
        strings to indicate column name(s) that should be used as index level(s),
        or ``None`` for no additional index levels. Default: ``None``
    sample_times: list of int, optional
        times at which saliva samples were collected or ``None`` if no sample times should be specified.
        Default: ``None``
    **kwargs
        Additional parameters that are passed to :func:`pandas.read_csv` or :func:`pandas.read_excel`

    Returns
    -------
    data : :class:`~biopsykit.utils.datatype_helper.SalivaRawDataFrame`
        saliva data in `SalivaRawDataFrame` format

    Raises
    ------
    :exc:`~biopsykit.utils.exceptions.FileExtensionError`
        if file is no csv or Excel file

    """
    return biomarker.load_saliva_wide_format(
        file_path, saliva_type, subject_col, condition_col, additional_index_cols, sample_times, **kwargs
    )


def _get_index_cols(condition_col: str, index_cols: Sequence[str], additional_index_cols: Sequence[str]):
    return biomarker._get_index_cols(condition_col, index_cols, additional_index_cols)


def _read_dataframe(file_path: Path, **kwargs):
    return biomarker._read_dataframe(file_path, **kwargs)


def _check_num_samples(num_samples: int, num_subjects: int):
    """Check that number of imported samples is the same for all subjects.

    Parameters
    ----------
    num_samples : int
        total number of saliva samples in the current dataframe
    num_subjects : int
        total number of subjects in the current dataframe

    Raises
    ------
    ValueError
        if number of samples is not equal for all subjects

    """
    biomarker._check_num_samples(num_samples, num_subjects)


def _check_sample_times(num_samples: int, num_subjects: int, sample_times: Sequence[int]):
    """Check that sample times have the correct number of samples and are monotonously increasing.

    Parameters
    ----------
    num_samples : int
        total number of saliva samples in the current dataframe
    num_subjects : int
        total number of subjects in the current dataframe
    sample_times : array-like
        list of sample times

    Raises
    ------
    ValueError
        if values in ``sample_times`` are not monotonously increasing or
        if number of saliva times does not match the number of saliva samples per subject

    """
    biomarker._check_sample_times(num_samples, num_subjects, sample_times)


def _parse_condition_list(
    data: pd.DataFrame, condition_list: Union[Sequence, Dict[str, Sequence], pd.Index]
) -> SubjectConditionDataFrame:
    return biomarker._parse_condition_list(data, condition_list)


def _apply_condition_list(
    data: pd.DataFrame,
    condition_list: Optional[Union[Sequence, Dict[str, Sequence], pd.Index]] = None,
):
    return biomarker._apply_condition_list(data, condition_list)


def _get_id_columns(id_col_names: Sequence[str], extracted_cols: pd.DataFrame):

    return biomarker._get_id_columns(id_col_names, extracted_cols)


def _get_condition_col(data: pd.DataFrame, condition_col: str) -> Tuple[pd.DataFrame, str]:
    return biomarker._get_condition_col(data, condition_col)